Merge pull request #422 from wernsaar/develop

optimization of sandybridge cgemm-kernel
This commit is contained in:
Zhang Xianyi 2014-07-30 17:09:58 +08:00
commit 4f83217df6
2 changed files with 127 additions and 55 deletions

View File

@ -25,6 +25,32 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************
* 2014/07/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
* 2013/10/28 Saar
* Parameter:
* CGEMM_DEFAULT_UNROLL_N 2
* CGEMM_DEFAULT_UNROLL_M 8
* CGEMM_DEFAULT_P 768
* CGEMM_DEFAULT_Q 512
* A_PR1 512
* B_PR1 512
*
* 2014/07/29 Saar
* Performance at 6192x6192x6192:
* 1 thread: 49 GFLOPS (MKL: 52)
* 2 threads: 99 GFLOPS (MKL: 102)
* 3 threads: 148 GFLOPS (MKL: 150)
* 4 threads: 195 GFLOPS (MKL: 194)
* 8 threads: 354 GFLOPS (MKL: 317)
*
*
*********************************************************************/
#define ASSEMBLER
#include "common.h"
@ -192,22 +218,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/***************************************************************************************************************************/
/*
 * KERNEL8x2_1 (preceded here by a second header, `.macro KERNEL8x2_SUB`)
 *
 * NOTE(review): this span looks like a diff rendering whose +/- markers were
 * lost, not like assemblable source:
 *   - two `.macro` headers open back to back, but only one `.endm` closes them;
 *   - several multiply-accumulate lines occur twice with identical operands and
 *     no reload of those registers in between, e.g.
 *     VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) and VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ).
 * Presumably the `.macro KERNEL8x2_SUB` header and the repeated lines are the
 * removed side of the change -- verify against the real file before relying
 * on the exact instruction sequence below.
 *
 * What the visible lines do:
 *   - load 32-byte vectors from AO (indexed by %rax, scaled by SIZE) into
 *     %ymm0 / %ymm1 at offsets -16, -8, 0, 8, 16, 24, 32 and 40 (units of SIZE);
 *   - broadcast single floats from BO (indexed by BI, scaled by SIZE) at
 *     offsets -8 .. 7 into %ymm4 .. %ymm7, reusing the four registers in turn;
 *   - pass them to VFMADDPS_YR / VFMADDPS_YI with %ymm8 .. %ymm15 as the first
 *     argument (those helper macros are not visible in this excerpt;
 *     presumably real/imaginary multiply-accumulate -- TODO confirm);
 *   - issue prefetcht0 at A_PR1, A_PR1+64, A_PR1+128 and A_PR1+192 relative to
 *     the current A position;
 *   - advance BI by 16 and %rax by 64, four times the 4 / 16 step used by the
 *     complete KERNEL8x2_SUB macro that appears later in this file.
 */
.macro KERNEL8x2_SUB
.macro KERNEL8x2_1
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
/* First of four A prefetches; the later ones step by 64 bytes each. */
prefetcht0 A_PR1(AO, %rax, SIZE)
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
/* %ymm0 / %ymm1 are reloaded with the next A vectors in between the
   remaining multiply-accumulate lines of the current group. */
vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1
prefetcht0 A_PR1+64(AO, %rax, SIZE)
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1
prefetcht0 A_PR1+128(AO, %rax, SIZE)
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1
prefetcht0 A_PR1+192(AO, %rax, SIZE)
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
/* BI += 16: sixteen B values (offsets -8 .. 7) were broadcast above. */
addq $ 16, BI
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
/* %rax += 64: the A loads above span offsets -16 .. 40 in steps of 8. */
addq $ 64, %rax
.endm
/*
 * KERNEL8x2_SUB -- one inner-product step of the 8x2 kernel.
 *
 * Reads:    AO + %rax*SIZE (two 32-byte vectors), BO + BI*SIZE (four scalars).
 * Updates:  %ymm8 .. %ymm15 via the VFMADDPS_YR / VFMADDPS_YI helper macros
 *           (not visible in this excerpt; presumably the real- and
 *           imaginary-part multiply-accumulate steps -- TODO confirm).
 * Clobbers: %ymm0, %ymm1 (A vectors), %ymm4 .. %ymm7 (broadcast B values).
 * Advances: BI by 4 and %rax by 16 (both are index registers scaled by SIZE).
 *
 * NOTE(review): the instruction order looks hand-scheduled (loads are issued
 * ahead of and between the multiply-accumulate lines); keep it as is unless a
 * change is measured.
 */
.macro KERNEL8x2_SUB
/* Two A vectors, 8 * SIZE apart (contiguous if SIZE is 4 bytes -- TODO confirm). */
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
/* Each B scalar is replicated across all lanes of its ymm register. */
vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
/* %ymm4 / %ymm5 (B offsets -8, -7) are applied to both A vectors ... */
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
/* ... and so are %ymm6 / %ymm7 (B offsets -6, -5). */
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
/* Step past the four B values consumed above. */
addq $ 4 , BI
/* Step past the two A vectors consumed above (2 x 8 elements if SIZE is 4 bytes). */
addq $ 16, %rax
.endm
@ -984,47 +1096,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_8_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
KERNEL8x2_1
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
KERNEL8x2_1
je .L2_8_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
KERNEL8x2_1
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
KERNEL8x2_1
je .L2_8_16
@ -1152,7 +1236,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
@ -1160,7 +1243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
@ -1170,7 +1252,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
@ -1178,7 +1259,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
@ -1305,14 +1385,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_4_22:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1321,14 +1399,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_4_26
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1507,13 +1583,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1522,13 +1596,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_4_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB

View File

@ -1134,9 +1134,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define QGEMM_DEFAULT_P 504
#define QGEMM_DEFAULT_R qgemm_r
#define CGEMM_DEFAULT_P 384
//#define CGEMM_DEFAULT_R cgemm_r
#define CGEMM_DEFAULT_R 1024
#define CGEMM_DEFAULT_P 768
#define CGEMM_DEFAULT_R cgemm_r
//#define CGEMM_DEFAULT_R 1024
#define ZGEMM_DEFAULT_P 512
#define ZGEMM_DEFAULT_R zgemm_r
@ -1148,7 +1148,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_Q 384
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 192
#define CGEMM_DEFAULT_Q 512
#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128