new sgemm 8x16

AbdelRauf 2019-06-17 15:33:38 +00:00
parent 148c4cc5fd
commit cdbfb891da
3 changed files with 288 additions and 251 deletions

View File

@@ -3,89 +3,89 @@ b L8
MY_ALIGN
LSGEMM_L8x16_LMAIN_SUB:
LOAD8x16_0
mtctr L
LOAD8x16_2
MY_ALIGN
LSGEMM_L8x16_LOOP:
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
KERNEL8x16_I1_L4_2 64,32, 15,0
KERNEL8x16_I1_L4_2 64,32, 16,0
KERNEL8x16_I1_L4_2 64,32, 17,0
KERNEL8x16_I1_L4_2 64,32, 18,0
KERNEL8x16_I1_L4_2 64,32, 19,0
KERNEL8x16_I1_L4_2 64,32, 20,0
KERNEL8x16_I1_L4_2 64,32, 21,0
KERNEL8x16_I1_L4_2 64,32, 22,0
KERNEL8x16_I1_L4_2 64,32, 23,0
KERNEL8x16_I1_L4_2 64,32, 24,0
KERNEL8x16_I1_L4_2 64,32, 25,0
KERNEL8x16_I1_L4_2 64,32, 26,0
KERNEL8x16_I1_L4_2 64,32, 27,0
KERNEL8x16_I1_L4_2 64,32, 28,0
KERNEL8x16_I1_L4_2 64,32, 29,0
KERNEL8x16_I1_L4_2 64,32, 30,0
KERNEL8x16_I1_L4_2 64,32, 31,1
KERNEL8x16_L2 128,64,0,0
LSGEMM_L8x16_K128:
KERNEL8x16_L2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64, 3,0
KERNEL8x16_I1_L4_2 128,64, 4,0
KERNEL8x16_I1_L4_2 128,64, 5,0
KERNEL8x16_I1_L4_2 128,64, 6,0
KERNEL8x16_I1_L4_2 128,64, 7,0
KERNEL8x16_I1_L4_2 128,64, 8,0
KERNEL8x16_I1_L4_2 128,64, 9,0
KERNEL8x16_I1_L4_2 128,64, 10,0
KERNEL8x16_I1_L4_2 128,64, 11,0
KERNEL8x16_I1_L4_2 128,64, 12,0
KERNEL8x16_I1_L4_2 128,64, 13,0
KERNEL8x16_I1_L4_2 128,64, 14,0
KERNEL8x16_I1_L4_2 128,64, 15,0
KERNEL8x16_I1_L4_2 128,64, 16,0
KERNEL8x16_I1_L4_2 128,64, 17,0
KERNEL8x16_I1_L4_2 128,64, 18,0
KERNEL8x16_I1_L4_2 128,64, 19,0
KERNEL8x16_I1_L4_2 128,64, 20,0
KERNEL8x16_I1_L4_2 128,64, 21,0
KERNEL8x16_I1_L4_2 128,64, 22,0
KERNEL8x16_I1_L4_2 128,64, 23,0
KERNEL8x16_I1_L4_2 128,64, 24,0
KERNEL8x16_I1_L4_2 128,64, 25,0
KERNEL8x16_I1_L4_2 128,64, 26,0
KERNEL8x16_I1_L4_2 128,64, 27,0
KERNEL8x16_I1_L4_2 128,64, 28,0
KERNEL8x16_I1_L4_2 128,64, 29,0
KERNEL8x16_I1_L4_2 128,64, 30,0
KERNEL8x16_I1_L4_2 128,64, 31,1
bdnz LSGEMM_L8x16_LOOP
MY_ALIGN
LSGEMM_L8x16_LOOP_END:
END8x16 0, AO, BO, 64, 32
END8x16_2
blr
MY_ALIGN
LSGEMM_L8x16_L64_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
KERNEL8x16_I1_L4_3 64,32, 15,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64, 3,0
KERNEL8x16_I1_L4_2 128,64, 4,0
KERNEL8x16_I1_L4_2 128,64, 5,0
KERNEL8x16_I1_L4_2 128,64, 6,0
KERNEL8x16_I1_L4_2 128,64, 7,0
KERNEL8x16_I1_L4_2 128,64, 8,0
KERNEL8x16_I1_L4_2 128,64, 9,0
KERNEL8x16_I1_L4_2 128,64, 10,0
KERNEL8x16_I1_L4_2 128,64, 11,0
KERNEL8x16_I1_L4_2 128,64, 12,0
KERNEL8x16_I1_L4_2 128,64, 13,0
KERNEL8x16_I1_L4_2 128,64, 14,0
KERNEL8x16_I1_L4_3 128,64, 15,1
blr
LSGEMM_L8x16_L32_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_3 64,32, 7,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64, 3,0
KERNEL8x16_I1_L4_2 128,64, 4,0
KERNEL8x16_I1_L4_2 128,64, 5,0
KERNEL8x16_I1_L4_2 128,64, 6,0
KERNEL8x16_I1_L4_3 128,64, 7,1
blr
LSGEMM_L8x16_L16_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_3 64,32, 3,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_3 128,64, 3,1
blr
L8:
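Structurally, each pass of LSGEMM_L8x16_LOOP above covers 128 k-iterations (two two-step KERNEL8x16_L2 kernels plus 31 four-step KERNEL8x16_I1_L4_2 macros), and the L64/L32/L16 subroutines cover the fixed power-of-two remainders. A minimal C sketch of that dispatch, with hypothetical helper names standing in for the labeled subroutines:

    /* Sketch only: each helper stands for the same-named assembly routine. */
    void do_128(void);  /* LSGEMM_L8x16_LMAIN_SUB loop body, via bdnz */
    void do_64(void);   /* LSGEMM_L8x16_L64_SUB */
    void do_32(void);   /* LSGEMM_L8x16_L32_SUB */
    void do_16(void);   /* LSGEMM_L8x16_L16_SUB */

    void kloop(int count) {                 /* count = peeled K, see below */
        for (int i = 0; i < count / 128; i++)
            do_128();
        if (count & 64) do_64();
        if (count & 32) do_32();
        if (count & 16) do_16();
        /* 8/4/2/1 leftovers fall through to the LSGEMM_L8x16_SUB2 ladder */
    }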
@@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 7 /* (T11-1) / 128 : 128x-unrolled pass count */
addi T12,T12, -2
srawi. L, T12, 7 /* (T11-2) / 128 : 128x-unrolled pass count */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 7 /* (K-1) / 128 : 128x-unrolled pass count */
addi T12,T12, -2
srawi. L, T12, 7 /* (K-2) / 128 : 128x-unrolled pass count */
#endif
ZERO8x16
ZERO8x16
mtctr L
ble LSGEMM_L8x16_SUB0
bl LSGEMM_L8x16_LMAIN_SUB
andi. L, T12, 127
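The peel constant changes from 1 to 2 because the rewritten pipeline preloads two k-iterations (LOAD8x16_2) before entering the loop and drains them in the epilogue (END8x16_2). Worked through for the non-TRMM path with K = 1026:

    T12 = K - 2      = 1024
    L   = 1024 >> 7  = 8        (full 128x passes, srawi)
    rem = 1024 & 127 = 0        (no remainder ladder, andi)
    total k          = 8*128 + 2 preloaded = 1026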
@@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0:
cmpwi T11,128
#else
andi. L, K, 255
cmpwi K,129
#endif
li T10,1
bne CMP8x16_128K
addi BO,BO,-32
addi AO,AO,-64
LOAD8x16 64,32
END8x16_WITHOUT_ADD
LOAD8x16_2O AO,BO, 128, 64
mtctr T10
bl LSGEMM_L8x16_K128
b LSGEMM_L8x16_SAVE
CMP8x16_128K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
cmpwi T11,128
#else
cmpwi K,128
#endif
bne LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB2_128:
bl LSGEMM_L8x16_L64_SUB
bl LSGEMM_L8x16_L64_SUB
b LSGEMM_L8x16_SAVE
#endif
bne LSGEMM_L8x16_SUB2
MY_ALIGN
mtctr T10
addi BO,BO,-64
addi AO,AO,-128
LOAD8x16_2O AO,BO, 128,64
bl LSGEMM_L8x16_K128
b LSGEMM_L8x16_SAVE
MY_ALIGN
LSGEMM_L8x16_SUB2:
andi. T10,L,64
@@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16:
LSGEMM_L8x16_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L8x16_SUB2_4
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_3 64,32, 1,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_3 128,64, 1,1
MY_ALIGN
LSGEMM_L8x16_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L8x16_SUB2_2
LOAD8x16_0
KERNEL8x16_I1_L4_3 64,32, 0,1
LOAD8x16_2
KERNEL8x16_I1_L4_3 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L8x16_SUB2_1
LOAD8x16_0
KERNEL8x16_I1_L2_3 64,32, 0,1
LOAD8x16_2
KERNEL8x16_E2 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_1:
andi. T10,L, 1

View File

@@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=8 and M=16
**********************************************************************************************/
.macro LOAD8x16_1
LOAD8x16 1
.endm
.macro LOAD8x16_0
LOAD8x16 0
.endm
.macro KERNEL8x16_L1_L4 Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
@@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm
@@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlxor vs63, vs63, vs63
.endm
.macro LOAD8x16 Zero
.macro LOAD8x16 OffsetA,OffsetB
lxv vs24, 0(BO)
lxv vs28, 16(BO)
lxv vs24, (\OffsetB+0)(BO)
lxv vs28, (\OffsetB+16)(BO)
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
lxv vs0, 0(AO)
lxv vs1, 16(AO)
lxv vs0, (\OffsetA+0)(AO)
lxv vs1, (\OffsetA+16)(AO)
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
lxv vs2, 32(AO)
lxv vs3, 48(AO)
lxv vs2, (\OffsetA+32)(AO)
lxv vs3, (\OffsetA+48)(AO)
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
.if \Zero==1
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
xxlxor vs36, vs36, vs36
xxlxor vs37, vs37, vs37
xxlxor vs38, vs38, vs38
xxlxor vs39, vs39, vs39
xxlxor vs40, vs40, vs40
xxlxor vs41, vs41, vs41
xxlxor vs42, vs42, vs42
xxlxor vs43, vs43, vs43
xxlxor vs44, vs44, vs44
xxlxor vs45, vs45, vs45
xxlxor vs46, vs46, vs46
xxlxor vs47, vs47, vs47
xxlxor vs48, vs48, vs48
xxlxor vs49, vs49, vs49
xxlxor vs50, vs50, vs50
xxlxor vs51, vs51, vs51
xxlxor vs52, vs52, vs52
xxlxor vs53, vs53, vs53
xxlxor vs54, vs54, vs54
xxlxor vs55, vs55, vs55
xxlxor vs56, vs56, vs56
xxlxor vs57, vs57, vs57
xxlxor vs58, vs58, vs58
xxlxor vs59, vs59, vs59
xxlxor vs60, vs60, vs60
xxlxor vs61, vs61, vs61
xxlxor vs62, vs62, vs62
xxlxor vs63, vs63, vs63
.endif
.endm
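LOAD8x16 now takes explicit OffsetA/OffsetB displacements, and accumulator clearing lives solely in ZERO8x16 instead of the old Zero flag. The xxperm/xxpermdi pairs build rotated copies of each B vector (vs24 -> vs25/vs26/vs27, vs28 -> vs29/vs30/vs31) so that every lane-wise xvmaddasp pairs each A element with a different B element, covering the whole 8x16 tile held in vs32..vs63 without splat instructions. A scalar C model of what one k-iteration accumulates (hypothetical names):

    /* One k-step of the 8x16 micro-kernel: a rank-1 update of the C tile.
       a[16] models vs0..vs3, b[8] the two B vectors behind vs24..vs31,
       acc[8][16] the accumulators vs32..vs63 (4 float lanes per vsr). */
    static void tile8x16_step(float acc[8][16],
                              const float a[16], const float b[8]) {
        for (int j = 0; j < 8; j++)
            for (int i = 0; i < 16; i++)
                acc[j][i] += a[i] * b[j];   /* xvmaddasp, 4 lanes at once */
    }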
.macro END8x16_NORMAL
END8x16 0, AO, BO, 64,32
.endm
.macro END8x16_WITHOUT_ADD
END8x16 0, AO,BO,0,0
.endm
.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
@@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
.endm
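KERNEL8x16_L1_L4_I is now a thin wrapper: one four-step macro expands to two two-step KERNEL8x16_2 kernels at even/odd indices, so for example:

    KERNEL8x16_I1_L4_2 128,64, 5,0
    /* expands, via KERNEL8x16_L1_L4_I AO,BO,128,64,5,0,0, to: */
    KERNEL8x16_2 AO,BO, 128,64, 10,0,0
    KERNEL8x16_2 AO,BO, 128,64, 11,0,0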
.macro KERNEL8x16 First
LOAD8x16 0
LOAD8x16 0,0
END8x16 \First, AO, BO, 64,32
.endm
.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
.macro LOAD8x16_2
LOAD8x16_2O AO,BO, 0,0
.endm
.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
lxv vs8, (\OffsetB)(\BREG)
lxv vs12, (16+\OffsetB)(\BREG)
lxv vs24, (32+\OffsetB)(\BREG)
lxv vs28, (32+16+\OffsetB)(\BREG)
lxv vs4, (0+\OffsetA)(\AREG)
lxv vs5, (16+\OffsetA)(\AREG)
xxperm vs10, vs8, permute_mask
xxperm vs14, vs12, permute_mask
lxv vs6, (32+\OffsetA)(\AREG)
lxv vs7, (48+\OffsetA)(\AREG)
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
lxv vs0, (64+\OffsetA)(\AREG)
lxv vs1, (64+16+\OffsetA)(\AREG)
xxpermdi vs11, vs10, vs10,2
xxpermdi vs15, vs14, vs14,2
lxv vs2, (64+32+\OffsetA)(\AREG)
lxv vs3, (64+48+\OffsetA)(\AREG)
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
.endm
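LOAD8x16_2O primes two k-iterations at once: vs4..vs7 and vs8..vs15 hold A/B for the even step, vs0..vs3 and vs24..vs31 for the odd step. That is what lets KERNEL8x16_2 below overlap the FMAs of one pair of steps with the loads and permutes of the next pair. A C sketch of the pipelining shape, assuming hypothetical helpers:

    struct kpair;                               /* two k-steps of A and B */
    void load_pair(struct kpair *p);            /* LOAD8x16_2 / LOAD8x16_2O */
    void fma_pair_reload(float acc[8][16], struct kpair *p); /* Complete==0 */
    void fma_pair_drain(float acc[8][16], struct kpair *p);  /* Complete==1 */

    void pipelined(float acc[8][16], int pairs) {
        struct kpair p;
        load_pair(&p);                          /* prime steps 0 and 1 */
        for (int i = 0; i < pairs - 1; i++)
            fma_pair_reload(acc, &p);           /* FMA 2 steps, load next 2 */
        fma_pair_drain(acc, &p);                /* END8x16_2: drain, no reload */
    }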
.macro END8x16_2
/* for the two-step load (LOAD8x16_2), OffsetA/OffsetB are 128 and 64 */
KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
.endm
.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
xvmaddasp vs32, vs4,vs8
xvmaddasp vs33, vs5,vs8
xvmaddasp vs48, vs4,vs12
xvmaddasp vs49, vs5,vs12
xvmaddasp vs40, vs4,vs10
xvmaddasp vs41, vs5,vs10
xvmaddasp vs56, vs4,vs14
xvmaddasp vs57, vs5,vs14
xvmaddasp vs36, vs4,vs9
xvmaddasp vs37, vs5,vs9
xvmaddasp vs52, vs4,vs13
xvmaddasp vs53, vs5,vs13
xvmaddasp vs44, vs4,vs11
xvmaddasp vs45, vs5,vs11
xvmaddasp vs60, vs4,vs15
xvmaddasp vs61, vs5,vs15
.if \Complete==0
lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
.endif
xvmaddasp vs34, vs6,vs8
xvmaddasp vs35, vs7,vs8
xvmaddasp vs50, vs6,vs12
xvmaddasp vs51, vs7,vs12
.if \Complete==0
lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
.endif
xvmaddasp vs42, vs6,vs10
xvmaddasp vs43, vs7,vs10
xvmaddasp vs58, vs6,vs14
xvmaddasp vs59, vs7,vs14
.if \Complete==0
xxperm vs10, vs8, permute_mask
xxperm vs14, vs12, permute_mask
.endif
xvmaddasp vs38, vs6,vs9
xvmaddasp vs39, vs7,vs9
xvmaddasp vs54, vs6,vs13
xvmaddasp vs55, vs7,vs13
.if \Complete==0
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
.endif
xvmaddasp vs46, vs6,vs11
xvmaddasp vs47, vs7,vs11
xvmaddasp vs62, vs6,vs15
xvmaddasp vs63, vs7,vs15
.if \Complete==0
xxpermdi vs11, vs10, vs10,2
xxpermdi vs15, vs14, vs14,2
.endif
.if \Complete==0
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
.endif
xvmaddasp vs32, vs0,vs24
xvmaddasp vs36, vs0,vs25
lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
xxperm vs10, vs8, permute_mask
xxperm vs14, vs12, permute_mask
xvmaddasp vs40, vs0,vs26
xvmaddasp vs44, vs0,vs27
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
xvmaddasp vs33, vs1,vs24
xvmaddasp vs48, vs0,vs28
xvmaddasp vs52, vs0,vs29
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
xvmaddasp vs49, vs1,vs28
xvmaddasp vs40, vs0,vs26
xvmaddasp vs41, vs1,vs26
xvmaddasp vs56, vs0,vs30
xvmaddasp vs57, vs1,vs30
xvmaddasp vs36, vs0,vs25
xvmaddasp vs37, vs1,vs25
xvmaddasp vs52, vs0,vs29
xvmaddasp vs53, vs1,vs29
xvmaddasp vs44, vs0,vs27
xvmaddasp vs45, vs1,vs27
xvmaddasp vs60, vs0,vs31
xxpermdi vs11, vs10, vs10,2
xxpermdi vs15, vs14, vs14,2
xvmaddasp vs33, vs1,vs24
xvmaddasp vs37, vs1,vs25
xvmaddasp vs41, vs1,vs26
xvmaddasp vs45, vs1,vs27
xvmaddasp vs49, vs1,vs28
xvmaddasp vs53, vs1,vs29
xvmaddasp vs57, vs1,vs30
xvmaddasp vs61, vs1,vs31
xvmaddasp vs61, vs1,vs31
.if \Complete==0
lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
.endif
xvmaddasp vs34, vs2,vs24
xvmaddasp vs38, vs2,vs25
xvmaddasp vs42, vs2,vs26
xvmaddasp vs46, vs2,vs27
xvmaddasp vs50, vs2,vs28
xvmaddasp vs54, vs2,vs29
xvmaddasp vs58, vs2,vs30
xvmaddasp vs62, vs2,vs31
xvmaddasp vs35, vs3,vs24
xvmaddasp vs39, vs3,vs25
xvmaddasp vs43, vs3,vs26
xvmaddasp vs47, vs3,vs27
xvmaddasp vs51, vs3,vs28
xvmaddasp vs55, vs3,vs29
xvmaddasp vs59, vs3,vs30
xvmaddasp vs63, vs3,vs31
.if \Complete==0
lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
.endif
xvmaddasp vs32, vs4,vs8
xvmaddasp vs36, vs4,vs9
xvmaddasp vs34, vs2,vs24
xvmaddasp vs35, vs3,vs24
xvmaddasp vs50, vs2,vs28
xvmaddasp vs51, vs3,vs28
.if \Complete==0
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
.endif
xvmaddasp vs42, vs2,vs26
xvmaddasp vs43, vs3,vs26
xvmaddasp vs58, vs2,vs30
xvmaddasp vs59, vs3,vs30
.if \Complete==0
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
.endif
xvmaddasp vs38, vs2,vs25
xvmaddasp vs39, vs3,vs25
xvmaddasp vs54, vs2,vs29
xvmaddasp vs55, vs3,vs29
.if \Complete==0
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
.endif
xvmaddasp vs46, vs2,vs27
xvmaddasp vs47, vs3,vs27
xvmaddasp vs62, vs2,vs31
xvmaddasp vs63, vs3,vs31
.if \Complete==0
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
.endif
.if \Complete==0
lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
addi \BREG, \BREG, DISP16(\Index,\OffsetB)
addi \AREG, \AREG, DISP32(\Index,\OffsetA)
.else
addi \AREG, \AREG, DISP32(\Index,128)
addi \BREG, \BREG, DISP16(\Index,64)
addi \AREG, \AREG, DISP32(\Index,128)
.endif
.endif
xvmaddasp vs40, vs4,vs10
xvmaddasp vs44, vs4,vs11
.if \Complete==0
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
.endif
xvmaddasp vs48, vs4,vs12
xvmaddasp vs52, vs4,vs13
.if \Complete==0
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
.endif
xvmaddasp vs56, vs4,vs14
xvmaddasp vs60, vs4,vs15
.if \Complete==0
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
.endif
xvmaddasp vs33, vs5,vs8
xvmaddasp vs37, vs5,vs9
xvmaddasp vs41, vs5,vs10
xvmaddasp vs45, vs5,vs11
xvmaddasp vs49, vs5,vs12
xvmaddasp vs53, vs5,vs13
xvmaddasp vs57, vs5,vs14
xvmaddasp vs61, vs5,vs15
xvmaddasp vs34, vs6,vs8
xvmaddasp vs38, vs6,vs9
xvmaddasp vs42, vs6,vs10
xvmaddasp vs46, vs6,vs11
xvmaddasp vs50, vs6,vs12
xvmaddasp vs54, vs6,vs13
xvmaddasp vs58, vs6,vs14
xvmaddasp vs62, vs6,vs15
xvmaddasp vs35, vs7,vs8
xvmaddasp vs39, vs7,vs9
xvmaddasp vs43, vs7,vs10
xvmaddasp vs47, vs7,vs11
xvmaddasp vs51, vs7,vs12
xvmaddasp vs55, vs7,vs13
xvmaddasp vs59, vs7,vs14
xvmaddasp vs63, vs7,vs15
.endm
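The DISP32/DISP16 displacements and the IsLast pointer bumps encode one two-step group's byte strides: 128 bytes of A (2 x 16 floats) and 64 bytes of B (2 x 8 floats). The macro definitions are not part of this diff; assuming the usual OpenBLAS form, they amount to:

    /* assumed expansions, consistent with the strides used above */
    #define DISP32(ind, disp) ((ind) * 128 + (disp)) /* 32 A floats per group */
    #define DISP16(ind, disp) ((ind) * 64 + (disp))  /* 16 B floats per group */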

View File

@@ -2253,7 +2253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 256
#define SGEMM_DEFAULT_Q 1025
#define SGEMM_DEFAULT_Q 1026
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 1026
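The SGEMM_DEFAULT_Q bump from 1025 to 1026 is presumably paired with the K-2 peel above: a full Q-sized panel then runs entirely in the 128x main loop with no remainder ladder, since

    1026 - 2 = 1024 = 8 * 128    (new kernel, two preloaded k-steps)
    1025 - 1 = 1024              (old kernel, one preloaded k-step)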