From cdbfb891da2a8de14aa1d9bd7a57265284f7432c Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Mon, 17 Jun 2019 15:33:38 +0000 Subject: [PATCH] new sgemm 8x16 --- kernel/power/sgemm_logic_power9.S | 193 ++++++++-------- kernel/power/sgemm_macros_power9.S | 344 +++++++++++++++-------------- param.h | 2 +- 3 files changed, 288 insertions(+), 251 deletions(-) diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 25e8c8387..053836cbf 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -3,89 +3,89 @@ b L8 MY_ALIGN LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_0 - mtctr L + LOAD8x16_2 MY_ALIGN LSGEMM_L8x16_LOOP: - - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_2 64,32, 15,0 - KERNEL8x16_I1_L4_2 64,32, 16,0 - KERNEL8x16_I1_L4_2 64,32, 17,0 - KERNEL8x16_I1_L4_2 64,32, 18,0 - KERNEL8x16_I1_L4_2 64,32, 19,0 - KERNEL8x16_I1_L4_2 64,32, 20,0 - KERNEL8x16_I1_L4_2 64,32, 21,0 - KERNEL8x16_I1_L4_2 64,32, 22,0 - KERNEL8x16_I1_L4_2 64,32, 23,0 - KERNEL8x16_I1_L4_2 64,32, 24,0 - KERNEL8x16_I1_L4_2 64,32, 25,0 - KERNEL8x16_I1_L4_2 64,32, 26,0 - KERNEL8x16_I1_L4_2 64,32, 27,0 - KERNEL8x16_I1_L4_2 64,32, 28,0 - KERNEL8x16_I1_L4_2 64,32, 29,0 - KERNEL8x16_I1_L4_2 64,32, 30,0 - KERNEL8x16_I1_L4_2 64,32, 31,1 + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: - END8x16 0, AO, BO, 64, 32 + END8x16_2 blr MY_ALIGN LSGEMM_L8x16_L64_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_3 64,32, 15,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 blr LSGEMM_L8x16_L32_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_3 64,32, 7,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 blr LSGEMM_L8x16_L16_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_3 64,32, 3,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 blr L8: @@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN: #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 7 /**(T11-1) % 128x */ + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ #else mr T12, K - addi T12,T12, -1 - srawi. L, T12, 7 /**(K-1) % 128x */ + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ #endif - ZERO8x16 + ZERO8x16 + mtctr L ble LSGEMM_L8x16_SUB0 bl LSGEMM_L8x16_LMAIN_SUB andi. L, T12, 127 @@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0: cmpwi T11,128 #else andi. L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else cmpwi K,128 -#endif - - bne LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB2_128: - bl LSGEMM_L8x16_L64_SUB - bl LSGEMM_L8x16_L64_SUB - b LSGEMM_L8x16_SAVE +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: andi. T10,L,64 @@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16: LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_3 64,32, 1,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 MY_ALIGN LSGEMM_L8x16_SUB2_4: andi. T10,L, 4 ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_0 - KERNEL8x16_I1_L4_3 64,32, 0,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_2: andi. T10,L, 2 ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_0 - KERNEL8x16_I1_L2_3 64,32, 0,1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 3f86a1d25..2c9e537c7 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ -.macro LOAD8x16_1 - LOAD8x16 1 -.endm - -.macro LOAD8x16_0 - LOAD8x16 0 -.endm + .macro KERNEL8x16_L1_L4 Index,IsLast KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 @@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm -.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - + .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm @@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs63, vs63, vs63 .endm -.macro LOAD8x16 Zero +.macro LOAD8x16 OffsetA,OffsetB - lxv vs24, 0(BO) - lxv vs28, 16(BO) + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask - lxv vs0, 0(AO) - lxv vs1, 16(AO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 - lxv vs2, 32(AO) - lxv vs3, 48(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif .endm .macro END8x16_NORMAL END8x16 0, AO, BO, 64,32 .endm +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + .macro END8x16 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 @@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete .endm .macro KERNEL8x16 First - LOAD8x16 0 + LOAD8x16 0,0 END8x16 \First, AO, BO, 64,32 .endm -.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + xvmaddasp vs33, vs1,vs24 xvmaddasp vs48, vs0,vs28 - xvmaddasp vs52, vs0,vs29 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 xvmaddasp vs60, vs0,vs31 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - - - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs61, vs1,vs31 + xvmaddasp vs61, vs1,vs31 .if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) .endif - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs36, vs4,vs9 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 .if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 .endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) .else - addi \AREG, \AREG, DISP32(\Index,128) addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) .endif .endif - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs52, vs4,vs13 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs60, vs4,vs15 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs61, vs5,vs15 - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs62, vs6,vs15 - - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs51, vs7,vs12 - xvmaddasp vs55, vs7,vs13 - xvmaddasp vs59, vs7,vs14 - xvmaddasp vs63, vs7,vs15 - .endm diff --git a/param.h b/param.h index 9a1a68ecd..3934da6c8 100644 --- a/param.h +++ b/param.h @@ -2253,7 +2253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1025 +#define SGEMM_DEFAULT_Q 1026 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 1026