/* OpenBLAS/kernel/power/sgemm_logic_power9.S
 * (extraction header: 2192 lines, 43 KiB; the original "ArmAsm" tag is a
 * mislabel — this is POWER9 PPC64 assembly, preprocessed with cpp.)
 */
#define MY_ALIGN .align 3
b L8
MY_ALIGN
LSGEMM_L8x16_LMAIN_SUB:
LOAD8x16_2
MY_ALIGN
LSGEMM_L8x16_LOOP:
KERNEL8x16_L2 128,64,0,0
LSGEMM_L8x16_K128:
KERNEL8x16_L2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64, 3,0
KERNEL8x16_I1_L4_2 128,64, 4,0
KERNEL8x16_I1_L4_2 128,64, 5,0
KERNEL8x16_I1_L4_2 128,64, 6,0
KERNEL8x16_I1_L4_2 128,64, 7,0
KERNEL8x16_I1_L4_2 128,64, 8,0
KERNEL8x16_I1_L4_2 128,64, 9,0
KERNEL8x16_I1_L4_2 128,64, 10,0
KERNEL8x16_I1_L4_2 128,64, 11,0
KERNEL8x16_I1_L4_2 128,64, 12,0
KERNEL8x16_I1_L4_2 128,64, 13,0
KERNEL8x16_I1_L4_2 128,64, 14,0
KERNEL8x16_I1_L4_2 128,64, 15,0
KERNEL8x16_I1_L4_2 128,64, 16,0
KERNEL8x16_I1_L4_2 128,64, 17,0
KERNEL8x16_I1_L4_2 128,64, 18,0
KERNEL8x16_I1_L4_2 128,64, 19,0
KERNEL8x16_I1_L4_2 128,64, 20,0
KERNEL8x16_I1_L4_2 128,64, 21,0
KERNEL8x16_I1_L4_2 128,64, 22,0
KERNEL8x16_I1_L4_2 128,64, 23,0
KERNEL8x16_I1_L4_2 128,64, 24,0
KERNEL8x16_I1_L4_2 128,64, 25,0
KERNEL8x16_I1_L4_2 128,64, 26,0
KERNEL8x16_I1_L4_2 128,64, 27,0
KERNEL8x16_I1_L4_2 128,64, 28,0
KERNEL8x16_I1_L4_2 128,64, 29,0
KERNEL8x16_I1_L4_2 128,64, 30,0
KERNEL8x16_I1_L4_2 128,64, 31,1
bdnz LSGEMM_L8x16_LOOP
MY_ALIGN
LSGEMM_L8x16_LOOP_END:
END8x16_2
blr
MY_ALIGN
LSGEMM_L8x16_L64_SUB:
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64,3,0
KERNEL8x16_I1_L4_2 128,64,4,0
KERNEL8x16_I1_L4_2 128,64,5,0
KERNEL8x16_I1_L4_2 128,64,6,0
KERNEL8x16_I1_L4_2 128,64,7,0
KERNEL8x16_I1_L4_2 128,64,8,0
KERNEL8x16_I1_L4_2 128,64,9,0
KERNEL8x16_I1_L4_2 128,64,10,0
KERNEL8x16_I1_L4_2 128,64,11,0
KERNEL8x16_I1_L4_2 128,64,12,0
KERNEL8x16_I1_L4_2 128,64,13,0
KERNEL8x16_I1_L4_2 128,64,14,0
KERNEL8x16_I1_L4_3 128,64,15,1
blr
LSGEMM_L8x16_L32_SUB:
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64,0,0
KERNEL8x16_I1_L4_2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64,2,0
KERNEL8x16_I1_L4_2 128,64,3,0
KERNEL8x16_I1_L4_2 128,64,4,0
KERNEL8x16_I1_L4_2 128,64,5,0
KERNEL8x16_I1_L4_2 128,64,6,0
KERNEL8x16_I1_L4_3 128,64,7,1
blr
LSGEMM_L8x16_L16_SUB:
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64,0,0
KERNEL8x16_I1_L4_2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64,2,0
KERNEL8x16_I1_L4_3 128,64,3,1
blr
L8:
#if defined(TRMMKERNEL) && !defined(LEFT)
neg TEMP_REG, OFFSET
#endif
srawi. J, N, 3
ble LSGEMM_L8_END
LSGEMM_L8_BEGIN:
li T1, 128
li T2, 256
mr AO, A
mr CO, C
slwi T3, LDC , 3
add C, C, T3
dcbt A, T1
dcbt A, T2
#if defined(TRMMKERNEL) && defined(LEFT)
mr TEMP_REG, OFFSET /*off = offset;*/
#endif
srawi. I, M, 4
ble LSGEMM_L8x16_END
MY_ALIGN
LSGEMM_L8x16_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
mr T12, T11
addi T12,T12, -2
srawi. L, T12, 7 /**(T11-2) % 128x */
#else
mr T12, K
addi T12,T12, -2
srawi. L, T12, 7 /**(K-2) % 128x */
#endif
ZERO8x16
ble LSGEMM_L8x16_SUB0
mtctr L
bl LSGEMM_L8x16_LMAIN_SUB
andi. L, T12, 127
ble LSGEMM_L8x16_SAVE
b LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 255
cmpwi T11,129
#else
andi. L, K, 255
cmpwi K,129
#endif
li T10,1
bne CMP8x16_128K
addi BO,BO,-32
addi AO,AO,-64
LOAD8x16 64,32
END8x16_WITHOUT_ADD
LOAD8x16_2O AO,BO, 128, 64
mtctr T10
bl LSGEMM_L8x16_K128
b LSGEMM_L8x16_SAVE
CMP8x16_128K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
cmpwi T11,128
#else
cmpwi K,128
#endif
bne LSGEMM_L8x16_SUB2
MY_ALIGN
mtctr T10
addi BO,BO,-64
addi AO,AO,-128
LOAD8x16_2O AO,BO, 128,64
bl LSGEMM_L8x16_K128
b LSGEMM_L8x16_SAVE
MY_ALIGN
LSGEMM_L8x16_SUB2:
andi. T10,L,64
ble LSGEMM_L8x16_SUB2_32
bl LSGEMM_L8x16_L64_SUB
MY_ALIGN
LSGEMM_L8x16_SUB2_32:
andi. T10,L, 32
ble LSGEMM_L8x16_SUB2_16
bl LSGEMM_L8x16_L32_SUB
MY_ALIGN
LSGEMM_L8x16_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L8x16_SUB2_8
bl LSGEMM_L8x16_L16_SUB
MY_ALIGN
LSGEMM_L8x16_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L8x16_SUB2_4
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_3 128,64, 1,1
MY_ALIGN
LSGEMM_L8x16_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L8x16_SUB2_2
LOAD8x16_2
KERNEL8x16_I1_L4_3 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L8x16_SUB2_1
LOAD8x16_2
KERNEL8x16_E2 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L8x16_SAVE
KERNEL8x16 0
MY_ALIGN
LSGEMM_L8x16_SAVE:
SAVE8x16
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8
#endif
addic. I, I, -1
bgt+ LSGEMM_L8x16_BEGIN
MY_ALIGN
LSGEMM_L8x16_END:
LSGEMM_L8x8_BEGIN:
andi. T2, M, 15
ble LSGEMM_L8x1_END
andi. T1, M, 8
ble LSGEMM_L8x8_END
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,8,8
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 4 /**(T11-1) % 16x */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 4 /**(K-1) % 16x */
#endif
ZERO8x8
ble LSGEMM_L8x8_SUB0
MY_ALIGN
LSGEMM_L8x8_LOOP_START:
LOAD8x8_0 /*we already zeroed */
mtctr L
MY_ALIGN
LSGEMM_L8x8_LOOP:
KERNEL8x8_I1_L4_2 32,32, 0,0
KERNEL8x8_I1_L4_2 32,32, 1,0
KERNEL8x8_I1_L4_2 32,32, 2,0
KERNEL8x8_I1_L4_2 32,32, 3,1
bdnz LSGEMM_L8x8_LOOP
MY_ALIGN
LSGEMM_L8x8_LOOP_END:
END8x8 0, AO, BO, 32, 32
b LSGEMM_L8x8_SUB1
MY_ALIGN
LSGEMM_L8x8_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 31
#else
andi. L, K, 31
#endif
b LSGEMM_L8x8_SUB2
MY_ALIGN
LSGEMM_L8x8_SUB1:
#if defined(TRMMKERNEL)
andi. L, T12, 15
#else
andi. L, T12, 15
#endif
ble LSGEMM_L8x8_SAVE
MY_ALIGN
LSGEMM_L8x8_SUB2:
srawi. T1,L, 3
ble LSGEMM_L8x8_SUB2_4
mtctr T1
MY_ALIGN
LSGEMM_L8x8_SUB2_LOOP:
LOAD8x8_0
KERNEL8x8_I1_L4_2 32,32, 0,0
KERNEL8x8_I1_L4_3 32,32, 1,1
bdnz LSGEMM_L8x8_SUB2_LOOP
MY_ALIGN
LSGEMM_L8x8_SUB2_4:
andi. T1,L, 4
ble LSGEMM_L8x8_SUB2_2
LOAD8x8_0
KERNEL8x8_I1_L4_3 32,32, 0,1
MY_ALIGN
LSGEMM_L8x8_SUB2_2:
andi. T1,L, 2
ble LSGEMM_L8x8_SUB2_1
LOAD8x8_0
KERNEL8x8_I1_L2_3 32,32, 0,1
MY_ALIGN
LSGEMM_L8x8_SUB2_1:
andi. T1,L, 1
ble LSGEMM_L8x8_SAVE
KERNEL8x8 0
MY_ALIGN
LSGEMM_L8x8_SAVE:
SAVE8x8
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8
#endif
MY_ALIGN
LSGEMM_L8x8_END:
LSGEMM_L8x4_BEGIN:
andi. T2, M, 15
ble LSGEMM_L8x1_END
andi. T1, M, 4
ble LSGEMM_L8x4_END
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,4,8
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 4 /**(T11-1) % 16x */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 4 /**(K-1) % 16x */
#endif
ZERO8x4
ble LSGEMM_L8x4_SUB0
MY_ALIGN
LSGEMM_L8x4_LOOP_START:
LOAD8x4_0 /*we already zeroed */
mtctr L
MY_ALIGN
LSGEMM_L8x4_LOOP:
KERNEL8x4_I1_L4_2 16,32, 0,0
KERNEL8x4_I1_L4_2 16,32, 1,0
KERNEL8x4_I1_L4_2 16,32, 2,0
KERNEL8x4_I1_L4_2 16,32, 3,1
bdnz LSGEMM_L8x4_LOOP
MY_ALIGN
LSGEMM_L8x4_LOOP_END:
END8x4 0, AO, BO, 16, 32
b LSGEMM_L8x4_SUB1
MY_ALIGN
LSGEMM_L8x4_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 31
#else
andi. L, K, 31
#endif
b LSGEMM_L8x4_SUB2
MY_ALIGN
LSGEMM_L8x4_SUB1:
#if defined(TRMMKERNEL)
andi. L, T12, 15
#else
andi. L, T12, 15
#endif
ble LSGEMM_L8x4_SAVE
MY_ALIGN
LSGEMM_L8x4_SUB2:
srawi. T1,L, 3
ble LSGEMM_L8x4_SUB2_4
mtctr T1
MY_ALIGN
LSGEMM_L8x4_SUB2_LOOP:
LOAD8x4_0
KERNEL8x4_I1_L4_2 16,32, 0,0
KERNEL8x4_I1_L4_3 16,32, 1,1
bdnz LSGEMM_L8x4_SUB2_LOOP
MY_ALIGN
LSGEMM_L8x4_SUB2_4:
andi. T1,L, 4
ble LSGEMM_L8x4_SUB2_2
LOAD8x4_0
KERNEL8x4_I1_L4_3 16,32, 0,1
MY_ALIGN
LSGEMM_L8x4_SUB2_2:
andi. T1,L, 2
ble LSGEMM_L8x4_SUB2_1
LOAD8x4_0
KERNEL8x4_I1_L2_3 16,32, 0,1
MY_ALIGN
LSGEMM_L8x4_SUB2_1:
andi. T1,L, 1
ble LSGEMM_L8x4_SAVE
KERNEL8x4 0
MY_ALIGN
LSGEMM_L8x4_SAVE:
SAVE8x4
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8
#endif
MY_ALIGN
LSGEMM_L8x4_END:
LSGEMM_L8x2_BEGIN:
andi. T1, M, 2
ble LSGEMM_L8x2_END
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,2,8
srawi. L, T11, 3 /**(T11) % 8x */
#else
srawi. L, K, 3 /**(K) % 8x */
#endif
ZERO8x2
ble LSGEMM_L8x2_SUB0
MY_ALIGN
LSGEMM_L8x2_LOOP_START:
mtctr L
MY_ALIGN
LSGEMM_L8x2_LOOP:
KERNEL8x2_2 0,0, 0,0
KERNEL8x2_2 0,0, 1,0
KERNEL8x2_2 0,0, 2,0
KERNEL8x2_2 0,0, 3,1
bdnz LSGEMM_L8x2_LOOP
MY_ALIGN
LSGEMM_L8x2_LOOP_END:
LSGEMM_L8x2_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 7
#else
andi. L, K, 7
#endif
ble LSGEMM_L8x2_SAVE
MY_ALIGN
LSGEMM_L8x2_SUB2:
andi. T1,L, 4
ble LSGEMM_L8x2_SUB2_2
KERNEL8x2_2 0,0, 0,0
KERNEL8x2_2 0,0, 1,1
MY_ALIGN
LSGEMM_L8x2_SUB2_2:
andi. T1,L, 2
ble LSGEMM_L8x2_SUB2_1
KERNEL8x2_2 0,0, 0,1
MY_ALIGN
LSGEMM_L8x2_SUB2_1:
andi. T1,L, 1
ble LSGEMM_L8x2_SAVE
KERNEL8x2
MY_ALIGN
LSGEMM_L8x2_SAVE:
SAVE8x2
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8
#endif
MY_ALIGN
LSGEMM_L8x2_END:
LSGEMM_L8x1_BEGIN:
andi. T1, M, 1
ble LSGEMM_L8x1_END
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,1,8
srawi. L, T11, 3 /**(T11) % 8x */
#else
srawi. L, K, 3 /**(K) % 8x */
#endif
ZERO8x1
ble LSGEMM_L8x1_SUB0
MY_ALIGN
LSGEMM_L8x1_LOOP_START:
mtctr L
MY_ALIGN
LSGEMM_L8x1_LOOP:
KERNEL8x1_4 0,0, 0,0
KERNEL8x1_4 0,0, 1,1
bdnz LSGEMM_L8x1_LOOP
MY_ALIGN
LSGEMM_L8x1_LOOP_END:
LSGEMM_L8x1_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 7
#else
andi. L, K, 7
#endif
ble LSGEMM_L8x1_SAVE
MY_ALIGN
LSGEMM_L8x1_SUB2:
andi. T1,L, 4
ble LSGEMM_L8x1_SUB2_2
KERNEL8x1_4 0,0, 0,1
MY_ALIGN
LSGEMM_L8x1_SUB2_2:
andi. T1,L, 2
ble LSGEMM_L8x1_SUB2_1
KERNEL8x1_2
MY_ALIGN
LSGEMM_L8x1_SUB2_1:
andi. T1,L, 1
ble LSGEMM_L8x1_SAVE
KERNEL8x1
MY_ALIGN
LSGEMM_L8x1_SAVE:
SAVE8x1
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8
#endif
MY_ALIGN
LSGEMM_L8x1_END:
slwi T1, K, 5
add B, B, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
addi TEMP_REG, TEMP_REG, 8
#endif
addic. J, J, -1
bgt LSGEMM_L8_BEGIN
LSGEMM_L8_END:
/* b LSGEMM_L4_BEGIN*/
andi. T1, N, 4
ble LSGEMM_L4_END
LSGEMM_L4_BEGIN:
mr AO, A
mr CO, C
slwi T3, LDC , 2
add C, C, T3
#if defined(TRMMKERNEL) && defined(LEFT)
mr TEMP_REG, OFFSET /*off = offset;*/
#endif
srawi. I, M, 4
ble LSGEMM_L4x16_END
MY_ALIGN
LSGEMM_L4x16_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,4
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 6 /**(T11-1) % 64x */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 6 /**(K-1) % 64x */
#endif
ZERO4x16
ble LSGEMM_L4x16_SUB0
MY_ALIGN
LSGEMM_L4x16_LOOP_START:
LOAD4x16_0 /*we already zeroed */
##OffsetA=64 OffsetB=16
addi AO,AO,2112
addi BO,BO,16
mtctr L
MY_ALIGN
LSGEMM_L4x16_LOOP:
KERNEL4x16_I1_L4_2 -2048,0, 0,0
KERNEL4x16_I1_L4_2 -2048,0, 1,0
KERNEL4x16_I1_L4_2 -2048,0, 2,0
KERNEL4x16_I1_L4_2 -2048,0, 3,0
KERNEL4x16_I1_L4_2 -2048,0, 4,0
KERNEL4x16_I1_L4_2 -2048,0, 5,0
KERNEL4x16_I1_L4_2 -2048,0, 6,0
KERNEL4x16_I1_L4_2 -2048,0, 7,0
KERNEL4x16_I1_L4_2 -2048,0, 8,0
KERNEL4x16_I1_L4_2 -2048,0, 9,0
KERNEL4x16_I1_L4_2 -2048,0, 10,0
KERNEL4x16_I1_L4_2 -2048,0, 11,0
KERNEL4x16_I1_L4_2 -2048,0, 12,0
KERNEL4x16_I1_L4_2 -2048,0, 13,0
KERNEL4x16_I1_L4_2 -2048,0, 14,0
KERNEL4x16_I1_L4_2 -2048,0, 15,1
bdnz LSGEMM_L4x16_LOOP
MY_ALIGN
LSGEMM_L4x16_LOOP_END:
END4x16 0, AO, BO, -2048, 0
b LSGEMM_L4x16_SUB1
MY_ALIGN
LSGEMM_L4x16_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 127
#else
andi. L, K, 127
#endif
b LSGEMM_L4x16_SUB2
MY_ALIGN
LSGEMM_L4x16_SUB1:
#if defined(TRMMKERNEL)
andi. L, T12, 63
#else
andi. L, T12, 63
#endif
ble LSGEMM_L4x16_SAVE
MY_ALIGN
LSGEMM_L4x16_SUB2:
srawi. T10,L, 5
ble LSGEMM_L4x16_SUB2_16
mtctr T10
MY_ALIGN
LSGEMM_L4x16_SUB2_LOOP:
LOAD4x16_0
KERNEL4x16_I1_L4_2 64,16, 0,0
KERNEL4x16_I1_L4_2 64,16, 1,0
KERNEL4x16_I1_L4_2 64,16, 2,0
KERNEL4x16_I1_L4_2 64,16, 3,0
KERNEL4x16_I1_L4_2 64,16, 4,0
KERNEL4x16_I1_L4_2 64,16, 5,0
KERNEL4x16_I1_L4_2 64,16, 6,0
KERNEL4x16_I1_L4_3 64,16, 7,1
bdnz LSGEMM_L4x16_SUB2_LOOP
MY_ALIGN
LSGEMM_L4x16_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L4x16_SUB2_8
LOAD4x16_0
KERNEL4x16_I1_L4_2 64,16, 0,0
KERNEL4x16_I1_L4_2 64,16, 1,0
KERNEL4x16_I1_L4_2 64,16, 2,0
KERNEL4x16_I1_L4_3 64,16, 3,1
MY_ALIGN
LSGEMM_L4x16_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L4x16_SUB2_4
LOAD4x16_0
KERNEL4x16_I1_L4_2 64,16, 0,0
KERNEL4x16_I1_L4_3 64,16, 1,1
MY_ALIGN
LSGEMM_L4x16_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L4x16_SUB2_2
LOAD4x16_0
KERNEL4x16_I1_L4_3 64,16, 0,1
MY_ALIGN
LSGEMM_L4x16_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L4x16_SUB2_1
LOAD4x16_0
KERNEL4x16_I1_L2_3 64,16, 0,1
MY_ALIGN
LSGEMM_L4x16_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L4x16_SAVE
KERNEL4x16 0
# addic. L, L, -1
# bgt LSGEMM_L4x16_SUB2
MY_ALIGN
LSGEMM_L4x16_SAVE:
SAVE4x16
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4
#endif
addic. I, I, -1
bgt+ LSGEMM_L4x16_BEGIN
MY_ALIGN
LSGEMM_L4x16_END:
LSGEMM_L4x8_BEGIN:
andi. T2, M, 15
ble LSGEMM_L4x1_END
andi. T1, M, 8
ble LSGEMM_L4x8_END
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,8,4
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 4 /**(T11-1) % 16x */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 4 /**(K-1) % 16x */
#endif
ZERO4x8
ble LSGEMM_L4x8_SUB0
MY_ALIGN
LSGEMM_L4x8_LOOP_START:
LOAD4x8_0 /*we already zeroed */
mtctr L
MY_ALIGN
LSGEMM_L4x8_LOOP:
KERNEL4x8_I1_L4_2 32,16, 0,0
KERNEL4x8_I1_L4_2 32,16, 1,0
KERNEL4x8_I1_L4_2 32,16, 2,0
KERNEL4x8_I1_L4_2 32,16, 3,1
bdnz LSGEMM_L4x8_LOOP
MY_ALIGN
LSGEMM_L4x8_LOOP_END:
END4x8 0, AO, BO, 32, 16
b LSGEMM_L4x8_SUB1
MY_ALIGN
LSGEMM_L4x8_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 31
#else
andi. L, K, 31
#endif
b LSGEMM_L4x8_SUB2
MY_ALIGN
LSGEMM_L4x8_SUB1:
#if defined(TRMMKERNEL)
andi. L, T12, 15
#else
andi. L, T12, 15
#endif
ble LSGEMM_L4x8_SAVE
MY_ALIGN
LSGEMM_L4x8_SUB2:
srawi. T1,L, 3
ble LSGEMM_L4x8_SUB2_4
mtctr T1
MY_ALIGN
LSGEMM_L4x8_SUB2_LOOP:
LOAD4x8_0
KERNEL4x8_I1_L4_2 32,16, 0,0
KERNEL4x8_I1_L4_3 32,16, 1,1
bdnz LSGEMM_L4x8_SUB2_LOOP
MY_ALIGN
LSGEMM_L4x8_SUB2_4:
andi. T1,L, 4
ble LSGEMM_L4x8_SUB2_2
LOAD4x8_0
KERNEL4x8_I1_L4_3 32,16, 0,1
MY_ALIGN
LSGEMM_L4x8_SUB2_2:
andi. T1,L, 2
ble LSGEMM_L4x8_SUB2_1
LOAD4x8_0
KERNEL4x8_I1_L2_3 32,16, 0,1
MY_ALIGN
LSGEMM_L4x8_SUB2_1:
andi. T1,L, 1
ble LSGEMM_L4x8_SAVE
KERNEL4x8 0
MY_ALIGN
LSGEMM_L4x8_SAVE:
SAVE4x8
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4
#endif
MY_ALIGN
LSGEMM_L4x8_END:
LSGEMM_L4x4_BEGIN:
andi. T2, M, 15
ble LSGEMM_L4x1_END
andi. T1, M, 4
ble LSGEMM_L4x4_END
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,4,4
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 4 /**(T11-1) % 16x */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 4 /**(K-1) % 16x */
#endif
ZERO4x4
ble LSGEMM_L4x4_SUB0
MY_ALIGN
LSGEMM_L4x4_LOOP_START:
LOAD4x4_0 /*we already zeroed */
mtctr L
MY_ALIGN
LSGEMM_L4x4_LOOP:
KERNEL4x4_I1_L4_2 16,16, 0,0
KERNEL4x4_I1_L4_2 16,16, 1,0
KERNEL4x4_I1_L4_2 16,16, 2,0
KERNEL4x4_I1_L4_2 16,16, 3,1
bdnz LSGEMM_L4x4_LOOP
MY_ALIGN
LSGEMM_L4x4_LOOP_END:
END4x4 0, AO, BO, 16, 16
b LSGEMM_L4x4_SUB1
MY_ALIGN
LSGEMM_L4x4_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 31
#else
andi. L, K, 31
#endif
b LSGEMM_L4x4_SUB2
MY_ALIGN
LSGEMM_L4x4_SUB1:
#if defined(TRMMKERNEL)
andi. L, T12, 15
#else
andi. L, T12, 15
#endif
ble LSGEMM_L4x4_SAVE
MY_ALIGN
LSGEMM_L4x4_SUB2:
srawi. T1,L, 3
ble LSGEMM_L4x4_SUB2_4
mtctr T1
MY_ALIGN
LSGEMM_L4x4_SUB2_LOOP:
LOAD4x4_0
KERNEL4x4_I1_L4_2 16,16, 0,0
KERNEL4x4_I1_L4_3 16,16, 1,1
bdnz LSGEMM_L4x4_SUB2_LOOP
MY_ALIGN
LSGEMM_L4x4_SUB2_4:
andi. T1,L, 4
ble LSGEMM_L4x4_SUB2_2
LOAD4x4_0
KERNEL4x4_I1_L4_3 16,16, 0,1
MY_ALIGN
LSGEMM_L4x4_SUB2_2:
andi. T1,L, 2
ble LSGEMM_L4x4_SUB2_1
LOAD4x4_0
KERNEL4x4_I1_L2_3 16,16, 0,1
MY_ALIGN
LSGEMM_L4x4_SUB2_1:
andi. T1,L, 1
ble LSGEMM_L4x4_SAVE
KERNEL4x4 0
MY_ALIGN
LSGEMM_L4x4_SAVE:
SAVE4x4
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4
#endif
MY_ALIGN
LSGEMM_L4x4_END:
LSGEMM_L4x2_BEGIN:
andi. T1, M, 2
ble LSGEMM_L4x2_END
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,2,4
srawi. L, T11, 3 /**(T11) % 8x */
#else
srawi. L, K, 3 /**(K) % 8x */
#endif
ZERO4x2
ble LSGEMM_L4x2_SUB0
MY_ALIGN
LSGEMM_L4x2_LOOP_START:
mtctr L
MY_ALIGN
LSGEMM_L4x2_LOOP:
KERNEL4x2_2 0,0, 0,0
KERNEL4x2_2 0,0, 1,0
KERNEL4x2_2 0,0, 2,0
KERNEL4x2_2 0,0, 3,1
bdnz LSGEMM_L4x2_LOOP
MY_ALIGN
LSGEMM_L4x2_LOOP_END:
LSGEMM_L4x2_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 7
#else
andi. L, K, 7
#endif
ble LSGEMM_L4x2_SAVE
MY_ALIGN
LSGEMM_L4x2_SUB2:
andi. T1,L, 4
ble LSGEMM_L4x2_SUB2_2
KERNEL4x2_2 0,0, 0,0
KERNEL4x2_2 0,0, 1,1
MY_ALIGN
LSGEMM_L4x2_SUB2_2:
andi. T1,L, 2
ble LSGEMM_L4x2_SUB2_1
KERNEL4x2_2 0,0, 0,1
MY_ALIGN
LSGEMM_L4x2_SUB2_1:
andi. T1,L, 1
ble LSGEMM_L4x2_SAVE
KERNEL4x2
MY_ALIGN
LSGEMM_L4x2_SAVE:
SAVE4x2
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4
#endif
MY_ALIGN
LSGEMM_L4x2_END:
LSGEMM_L4x1_BEGIN:
andi. T1, M, 1
ble LSGEMM_L4x1_END
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,1,4
srawi. L, T11, 3 /**(T11) % 8x */
#else
srawi. L, K, 3 /**(K) % 8x */
#endif
ZERO4x1
ble LSGEMM_L4x1_SUB0
MY_ALIGN
LSGEMM_L4x1_LOOP_START:
mtctr L
MY_ALIGN
LSGEMM_L4x1_LOOP:
KERNEL4x1_4 0,0, 0,0
KERNEL4x1_4 0,0, 1,1
bdnz LSGEMM_L4x1_LOOP
MY_ALIGN
LSGEMM_L4x1_LOOP_END:
LSGEMM_L4x1_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 7
#else
andi. L, K, 7
#endif
ble LSGEMM_L4x1_SAVE
MY_ALIGN
LSGEMM_L4x1_SUB2:
andi. T1,L, 4
ble LSGEMM_L4x1_SUB2_2
KERNEL4x1_4 0,0, 0,1
MY_ALIGN
LSGEMM_L4x1_SUB2_2:
andi. T1,L, 2
ble LSGEMM_L4x1_SUB2_1
KERNEL4x1_2
MY_ALIGN
LSGEMM_L4x1_SUB2_1:
andi. T1,L, 1
ble LSGEMM_L4x1_SAVE
KERNEL4x1
MY_ALIGN
LSGEMM_L4x1_SAVE:
SAVE4x1
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4
#endif
MY_ALIGN
LSGEMM_L4x1_END:
slwi T1, K, 4
add B, B, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
addi TEMP_REG, TEMP_REG, 4
#endif
andi. T2, N, 3
ble .L999
LSGEMM_L4_END:
andi. T1, N, 2
ble LSGEMM_L2_END
LSGEMM_L2_BEGIN:
mr AO, A
mr CO, C
slwi T3, LDC , 1
add C, C, T3
#if defined(TRMMKERNEL) && defined(LEFT)
mr TEMP_REG, OFFSET /*off = offset;*/
#endif
srawi. I, M, 4
ble LSGEMM_L2x16_END
MY_ALIGN
LSGEMM_L2x16_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,2
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO2x16
ble LSGEMM_L2x16_SUB0
addi AO,AO,2048
mtctr L
MY_ALIGN
LSGEMM_L2x16_LOOP:
KERNEL2x16_4 -2048,0, 0,0
KERNEL2x16_4 -2048,0, 1,0
KERNEL2x16_4 -2048,0, 2,0
KERNEL2x16_4 -2048,0, 3,0
KERNEL2x16_4 -2048,0, 4,0
KERNEL2x16_4 -2048,0, 5,0
KERNEL2x16_4 -2048,0, 6,0
KERNEL2x16_4 -2048,0, 7,0
KERNEL2x16_4 -2048,0, 8,0
KERNEL2x16_4 -2048,0, 9,0
KERNEL2x16_4 -2048,0, 10,0
KERNEL2x16_4 -2048,0, 11,0
KERNEL2x16_4 -2048,0, 12,0
KERNEL2x16_4 -2048,0, 13,0
KERNEL2x16_4 -2048,0, 14,0
KERNEL2x16_4 -2048,0, 15,1
bdnz LSGEMM_L2x16_LOOP
MY_ALIGN
addi AO,AO, -2048
MY_ALIGN
LSGEMM_L2x16_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_L2x16_SAVE
MY_ALIGN
LSGEMM_L2x16_SUB2:
andi. T10,L, 32
ble LSGEMM_L2x16_SUB2_16
KERNEL2x16_4 0,0, 0,0
KERNEL2x16_4 0,0, 1,0
KERNEL2x16_4 0,0, 2,0
KERNEL2x16_4 0,0, 3,0
KERNEL2x16_4 0,0, 4,0
KERNEL2x16_4 0,0, 5,0
KERNEL2x16_4 0,0, 6,0
KERNEL2x16_4 0,0, 7,1
MY_ALIGN
LSGEMM_L2x16_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L2x16_SUB2_8
KERNEL2x16_4 0,0, 0,0
KERNEL2x16_4 0,0, 1,0
KERNEL2x16_4 0,0, 2,0
KERNEL2x16_4 0,0, 3,1
MY_ALIGN
LSGEMM_L2x16_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L2x16_SUB2_4
KERNEL2x16_4 0,0, 0,0
KERNEL2x16_4 0,0, 1,1
MY_ALIGN
LSGEMM_L2x16_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L2x16_SUB2_2
KERNEL2x16_4 0,0, 0,1
MY_ALIGN
LSGEMM_L2x16_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L2x16_SUB2_1
KERNEL2x16_2 0,0, 0,1
MY_ALIGN
LSGEMM_L2x16_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L2x16_SAVE
KERNEL2x16
MY_ALIGN
LSGEMM_L2x16_SAVE:
SAVE2x16
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2
#endif
addic. I, I, -1
bgt+ LSGEMM_L2x16_BEGIN
MY_ALIGN
LSGEMM_L2x16_END:
andi. I, M, 8
ble LSGEMM_L2x8_END
MY_ALIGN
LSGEMM_L2x8_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,8,2
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO2x8
ble LSGEMM_L2x8_SUB0
addi AO,AO,2048
mtctr L
MY_ALIGN
LSGEMM_L2x8_LOOP:
KERNEL2x8_4 -2048,0, 0,0
KERNEL2x8_4 -2048,0, 1,0
KERNEL2x8_4 -2048,0, 2,0
KERNEL2x8_4 -2048,0, 3,0
KERNEL2x8_4 -2048,0, 4,0
KERNEL2x8_4 -2048,0, 5,0
KERNEL2x8_4 -2048,0, 6,0
KERNEL2x8_4 -2048,0, 7,0
KERNEL2x8_4 -2048,0, 8,0
KERNEL2x8_4 -2048,0, 9,0
KERNEL2x8_4 -2048,0, 10,0
KERNEL2x8_4 -2048,0, 11,0
KERNEL2x8_4 -2048,0, 12,0
KERNEL2x8_4 -2048,0, 13,0
KERNEL2x8_4 -2048,0, 14,0
KERNEL2x8_4 -2048,0, 15,1
bdnz LSGEMM_L2x8_LOOP
MY_ALIGN
addi AO,AO, -2048
MY_ALIGN
LSGEMM_L2x8_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_L2x8_SAVE
MY_ALIGN
LSGEMM_L2x8_SUB2:
andi. T10,L, 32
ble LSGEMM_L2x8_SUB2_16
KERNEL2x8_4 0,0, 0,0
KERNEL2x8_4 0,0, 1,0
KERNEL2x8_4 0,0, 2,0
KERNEL2x8_4 0,0, 3,0
KERNEL2x8_4 0,0, 4,0
KERNEL2x8_4 0,0, 5,0
KERNEL2x8_4 0,0, 6,0
KERNEL2x8_4 0,0, 7,1
MY_ALIGN
LSGEMM_L2x8_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L2x8_SUB2_8
KERNEL2x8_4 0,0, 0,0
KERNEL2x8_4 0,0, 1,0
KERNEL2x8_4 0,0, 2,0
KERNEL2x8_4 0,0, 3,1
MY_ALIGN
LSGEMM_L2x8_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L2x8_SUB2_4
KERNEL2x8_4 0,0, 0,0
KERNEL2x8_4 0,0, 1,1
MY_ALIGN
LSGEMM_L2x8_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L2x8_SUB2_2
KERNEL2x8_4 0,0, 0,1
MY_ALIGN
LSGEMM_L2x8_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L2x8_SUB2_1
KERNEL2x8_2 0,0, 0,1
MY_ALIGN
LSGEMM_L2x8_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L2x8_SAVE
KERNEL2x8
MY_ALIGN
LSGEMM_L2x8_SAVE:
SAVE2x8
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2
#endif
MY_ALIGN
LSGEMM_L2x8_END:
andi. I, M, 4
ble LSGEMM_L2x4_END
MY_ALIGN
LSGEMM_L2x4_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,4,2
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO2x4
ble LSGEMM_L2x4_SUB0
mtctr L
MY_ALIGN
LSGEMM_L2x4_LOOP:
KERNEL2x4_4 0,0, 0,0
KERNEL2x4_4 0,0, 1,0
KERNEL2x4_4 0,0, 2,0
KERNEL2x4_4 0,0, 3,0
KERNEL2x4_4 0,0, 4,0
KERNEL2x4_4 0,0, 5,0
KERNEL2x4_4 0,0, 6,0
KERNEL2x4_4 0,0, 7,0
KERNEL2x4_4 0,0, 8,0
KERNEL2x4_4 0,0, 9,0
KERNEL2x4_4 0,0, 10,0
KERNEL2x4_4 0,0, 11,0
KERNEL2x4_4 0,0, 12,0
KERNEL2x4_4 0,0, 13,0
KERNEL2x4_4 0,0, 14,0
KERNEL2x4_4 0,0, 15,1
bdnz LSGEMM_L2x4_LOOP
MY_ALIGN
MY_ALIGN
LSGEMM_L2x4_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_L2x4_SAVE
MY_ALIGN
LSGEMM_L2x4_SUB2:
andi. T10,L, 32
ble LSGEMM_L2x4_SUB2_16
KERNEL2x4_4 0,0, 0,0
KERNEL2x4_4 0,0, 1,0
KERNEL2x4_4 0,0, 2,0
KERNEL2x4_4 0,0, 3,0
KERNEL2x4_4 0,0, 4,0
KERNEL2x4_4 0,0, 5,0
KERNEL2x4_4 0,0, 6,0
KERNEL2x4_4 0,0, 7,1
MY_ALIGN
LSGEMM_L2x4_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L2x4_SUB2_8
KERNEL2x4_4 0,0, 0,0
KERNEL2x4_4 0,0, 1,0
KERNEL2x4_4 0,0, 2,0
KERNEL2x4_4 0,0, 3,1
MY_ALIGN
LSGEMM_L2x4_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L2x4_SUB2_4
KERNEL2x4_4 0,0, 0,0
KERNEL2x4_4 0,0, 1,1
MY_ALIGN
LSGEMM_L2x4_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L2x4_SUB2_2
KERNEL2x4_4 0,0, 0,1
MY_ALIGN
LSGEMM_L2x4_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L2x4_SUB2_1
KERNEL2x4_2 0,0, 0,1
MY_ALIGN
LSGEMM_L2x4_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L2x4_SAVE
KERNEL2x4
MY_ALIGN
LSGEMM_L2x4_SAVE:
SAVE2x4
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2
#endif
MY_ALIGN
LSGEMM_L2x4_END:
andi. I, M, 2
ble LSGEMM_L2x2_END
MY_ALIGN
LSGEMM_L2x2_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,2,2
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO2x2
ble LSGEMM_L2x2_SUB0
mtctr L
MY_ALIGN
LSGEMM_L2x2_LOOP:
KERNEL2x2_4 0,0, 0,0
KERNEL2x2_4 0,0, 1,0
KERNEL2x2_4 0,0, 2,0
KERNEL2x2_4 0,0, 3,0
KERNEL2x2_4 0,0, 4,0
KERNEL2x2_4 0,0, 5,0
KERNEL2x2_4 0,0, 6,0
KERNEL2x2_4 0,0, 7,0
KERNEL2x2_4 0,0, 8,0
KERNEL2x2_4 0,0, 9,0
KERNEL2x2_4 0,0, 10,0
KERNEL2x2_4 0,0, 11,0
KERNEL2x2_4 0,0, 12,0
KERNEL2x2_4 0,0, 13,0
KERNEL2x2_4 0,0, 14,0
KERNEL2x2_4 0,0, 15,1
bdnz LSGEMM_L2x2_LOOP
MY_ALIGN
MY_ALIGN
LSGEMM_L2x2_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_L2x2_SAVE
MY_ALIGN
LSGEMM_L2x2_SUB2:
andi. T10,L, 32
ble LSGEMM_L2x2_SUB2_16
KERNEL2x2_4 0,0, 0,0
KERNEL2x2_4 0,0, 1,0
KERNEL2x2_4 0,0, 2,0
KERNEL2x2_4 0,0, 3,0
KERNEL2x2_4 0,0, 4,0
KERNEL2x2_4 0,0, 5,0
KERNEL2x2_4 0,0, 6,0
KERNEL2x2_4 0,0, 7,1
MY_ALIGN
LSGEMM_L2x2_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L2x2_SUB2_8
KERNEL2x2_4 0,0, 0,0
KERNEL2x2_4 0,0, 1,0
KERNEL2x2_4 0,0, 2,0
KERNEL2x2_4 0,0, 3,1
MY_ALIGN
LSGEMM_L2x2_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L2x2_SUB2_4
KERNEL2x2_4 0,0, 0,0
KERNEL2x2_4 0,0, 1,1
MY_ALIGN
LSGEMM_L2x2_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L2x2_SUB2_2
KERNEL2x2_4 0,0, 0,1
MY_ALIGN
LSGEMM_L2x2_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L2x2_SUB2_1
KERNEL2x2_2 0,0, 0,1
MY_ALIGN
LSGEMM_L2x2_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L2x2_SAVE
KERNEL2x2
MY_ALIGN
LSGEMM_L2x2_SAVE:
SAVE2x2
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2
#endif
MY_ALIGN
LSGEMM_L2x2_END:
andi. I, M, 1
ble LSGEMM_L2x1_END
MY_ALIGN
LSGEMM_L2x1_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,1,2
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO2x1
ble LSGEMM_L2x1_SUB0
mtctr L
MY_ALIGN
LSGEMM_L2x1_LOOP:
KERNEL2x1_4 0,0, 0,0
KERNEL2x1_4 0,0, 1,0
KERNEL2x1_4 0,0, 2,0
KERNEL2x1_4 0,0, 3,0
KERNEL2x1_4 0,0, 4,0
KERNEL2x1_4 0,0, 5,0
KERNEL2x1_4 0,0, 6,0
KERNEL2x1_4 0,0, 7,0
KERNEL2x1_4 0,0, 8,0
KERNEL2x1_4 0,0, 9,0
KERNEL2x1_4 0,0, 10,0
KERNEL2x1_4 0,0, 11,0
KERNEL2x1_4 0,0, 12,0
KERNEL2x1_4 0,0, 13,0
KERNEL2x1_4 0,0, 14,0
KERNEL2x1_4 0,0, 15,1
bdnz LSGEMM_L2x1_LOOP
MY_ALIGN
MY_ALIGN
LSGEMM_L2x1_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_L2x1_SAVE
MY_ALIGN
LSGEMM_L2x1_SUB2:
andi. T10,L, 32
ble LSGEMM_L2x1_SUB2_16
KERNEL2x1_4 0,0, 0,0
KERNEL2x1_4 0,0, 1,0
KERNEL2x1_4 0,0, 2,0
KERNEL2x1_4 0,0, 3,0
KERNEL2x1_4 0,0, 4,0
KERNEL2x1_4 0,0, 5,0
KERNEL2x1_4 0,0, 6,0
KERNEL2x1_4 0,0, 7,1
MY_ALIGN
LSGEMM_L2x1_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L2x1_SUB2_8
KERNEL2x1_4 0,0, 0,0
KERNEL2x1_4 0,0, 1,0
KERNEL2x1_4 0,0, 2,0
KERNEL2x1_4 0,0, 3,1
MY_ALIGN
LSGEMM_L2x1_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L2x1_SUB2_4
KERNEL2x1_4 0,0, 0,0
KERNEL2x1_4 0,0, 1,1
MY_ALIGN
LSGEMM_L2x1_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L2x1_SUB2_2
KERNEL2x1_4 0,0, 0,1
MY_ALIGN
LSGEMM_L2x1_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L2x1_SUB2_1
KERNEL2x1_2 0,0, 0,1
MY_ALIGN
LSGEMM_L2x1_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L2x1_SAVE
KERNEL2x1
MY_ALIGN
LSGEMM_L2x1_SAVE:
SAVE2x1
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2
#endif
MY_ALIGN
LSGEMM_L2x1_END:
slwi T1, K, 3
add B, B, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
addi TEMP_REG, TEMP_REG, 2
#endif
LSGEMM_L2_END:
andi. T1, N, 1
ble LSGEMM_END
LSGEMM_1_BEGIN:
mr AO, A
mr CO, C
add C, C, LDC
#if defined(TRMMKERNEL) && defined(LEFT)
mr TEMP_REG, OFFSET /*off = offset;*/
#endif
srawi. I, M, 4
ble LSGEMM_1x16_END
MY_ALIGN
LSGEMM_1x16_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,1
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO1x16
ble LSGEMM_1x16_SUB0
addi AO,AO,2048
mtctr L
MY_ALIGN
LSGEMM_1x16_LOOP:
KERNEL1x16_4 -2048,0, 0,0
KERNEL1x16_4 -2048,0, 1,0
KERNEL1x16_4 -2048,0, 2,0
KERNEL1x16_4 -2048,0, 3,0
KERNEL1x16_4 -2048,0, 4,0
KERNEL1x16_4 -2048,0, 5,0
KERNEL1x16_4 -2048,0, 6,0
KERNEL1x16_4 -2048,0, 7,0
KERNEL1x16_4 -2048,0, 8,0
KERNEL1x16_4 -2048,0, 9,0
KERNEL1x16_4 -2048,0, 10,0
KERNEL1x16_4 -2048,0, 11,0
KERNEL1x16_4 -2048,0, 12,0
KERNEL1x16_4 -2048,0, 13,0
KERNEL1x16_4 -2048,0, 14,0
KERNEL1x16_4 -2048,0, 15,1
bdnz LSGEMM_1x16_LOOP
MY_ALIGN
addi AO,AO, -2048
MY_ALIGN
LSGEMM_1x16_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_1x16_SAVE
MY_ALIGN
LSGEMM_1x16_SUB2:
andi. T10,L, 32
ble LSGEMM_1x16_SUB2_16
KERNEL1x16_4 0,0, 0,0
KERNEL1x16_4 0,0, 1,0
KERNEL1x16_4 0,0, 2,0
KERNEL1x16_4 0,0, 3,0
KERNEL1x16_4 0,0, 4,0
KERNEL1x16_4 0,0, 5,0
KERNEL1x16_4 0,0, 6,0
KERNEL1x16_4 0,0, 7,1
MY_ALIGN
LSGEMM_1x16_SUB2_16:
andi. T10,L, 16
ble LSGEMM_1x16_SUB2_8
KERNEL1x16_4 0,0, 0,0
KERNEL1x16_4 0,0, 1,0
KERNEL1x16_4 0,0, 2,0
KERNEL1x16_4 0,0, 3,1
MY_ALIGN
LSGEMM_1x16_SUB2_8:
andi. T10,L, 8
ble LSGEMM_1x16_SUB2_4
KERNEL1x16_4 0,0, 0,0
KERNEL1x16_4 0,0, 1,1
MY_ALIGN
LSGEMM_1x16_SUB2_4:
andi. T10,L, 4
ble LSGEMM_1x16_SUB2_2
KERNEL1x16_4 0,0, 0,1
MY_ALIGN
LSGEMM_1x16_SUB2_2:
andi. T10,L, 2
ble LSGEMM_1x16_SUB2_1
KERNEL1x16_2 0,0, 0,1
MY_ALIGN
LSGEMM_1x16_SUB2_1:
andi. T10,L, 1
ble LSGEMM_1x16_SAVE
KERNEL1x16
MY_ALIGN
LSGEMM_1x16_SAVE:
SAVE1x16
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1
#endif
addic. I, I, -1
bgt+ LSGEMM_1x16_BEGIN
MY_ALIGN
LSGEMM_1x16_END:
andi. I, M, 8
ble LSGEMM_1x8_END
MY_ALIGN
LSGEMM_1x8_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,8,1
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO1x8
ble LSGEMM_1x8_SUB0
addi AO,AO,2048
mtctr L
MY_ALIGN
LSGEMM_1x8_LOOP:
KERNEL1x8_4 -2048,0, 0,0
KERNEL1x8_4 -2048,0, 1,0
KERNEL1x8_4 -2048,0, 2,0
KERNEL1x8_4 -2048,0, 3,0
KERNEL1x8_4 -2048,0, 4,0
KERNEL1x8_4 -2048,0, 5,0
KERNEL1x8_4 -2048,0, 6,0
KERNEL1x8_4 -2048,0, 7,0
KERNEL1x8_4 -2048,0, 8,0
KERNEL1x8_4 -2048,0, 9,0
KERNEL1x8_4 -2048,0, 10,0
KERNEL1x8_4 -2048,0, 11,0
KERNEL1x8_4 -2048,0, 12,0
KERNEL1x8_4 -2048,0, 13,0
KERNEL1x8_4 -2048,0, 14,0
KERNEL1x8_4 -2048,0, 15,1
bdnz LSGEMM_1x8_LOOP
MY_ALIGN
addi AO,AO, -2048
MY_ALIGN
LSGEMM_1x8_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_1x8_SAVE
MY_ALIGN
LSGEMM_1x8_SUB2:
andi. T10,L, 32
ble LSGEMM_1x8_SUB2_16
KERNEL1x8_4 0,0, 0,0
KERNEL1x8_4 0,0, 1,0
KERNEL1x8_4 0,0, 2,0
KERNEL1x8_4 0,0, 3,0
KERNEL1x8_4 0,0, 4,0
KERNEL1x8_4 0,0, 5,0
KERNEL1x8_4 0,0, 6,0
KERNEL1x8_4 0,0, 7,1
MY_ALIGN
LSGEMM_1x8_SUB2_16:
andi. T10,L, 16
ble LSGEMM_1x8_SUB2_8
KERNEL1x8_4 0,0, 0,0
KERNEL1x8_4 0,0, 1,0
KERNEL1x8_4 0,0, 2,0
KERNEL1x8_4 0,0, 3,1
MY_ALIGN
LSGEMM_1x8_SUB2_8:
andi. T10,L, 8
ble LSGEMM_1x8_SUB2_4
KERNEL1x8_4 0,0, 0,0
KERNEL1x8_4 0,0, 1,1
MY_ALIGN
LSGEMM_1x8_SUB2_4:
andi. T10,L, 4
ble LSGEMM_1x8_SUB2_2
KERNEL1x8_4 0,0, 0,1
MY_ALIGN
LSGEMM_1x8_SUB2_2:
andi. T10,L, 2
ble LSGEMM_1x8_SUB2_1
KERNEL1x8_2 0,0, 0,1
MY_ALIGN
LSGEMM_1x8_SUB2_1:
andi. T10,L, 1
ble LSGEMM_1x8_SAVE
KERNEL1x8
MY_ALIGN
LSGEMM_1x8_SAVE:
SAVE1x8
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1
#endif
MY_ALIGN
LSGEMM_1x8_END:
andi. I, M, 4
ble LSGEMM_1x4_END
MY_ALIGN
LSGEMM_1x4_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,4,1
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO1x4
ble LSGEMM_1x4_SUB0
mtctr L
MY_ALIGN
LSGEMM_1x4_LOOP:
KERNEL1x4_4 0,0, 0,0
KERNEL1x4_4 0,0, 1,0
KERNEL1x4_4 0,0, 2,0
KERNEL1x4_4 0,0, 3,0
KERNEL1x4_4 0,0, 4,0
KERNEL1x4_4 0,0, 5,0
KERNEL1x4_4 0,0, 6,0
KERNEL1x4_4 0,0, 7,0
KERNEL1x4_4 0,0, 8,0
KERNEL1x4_4 0,0, 9,0
KERNEL1x4_4 0,0, 10,0
KERNEL1x4_4 0,0, 11,0
KERNEL1x4_4 0,0, 12,0
KERNEL1x4_4 0,0, 13,0
KERNEL1x4_4 0,0, 14,0
KERNEL1x4_4 0,0, 15,1
bdnz LSGEMM_1x4_LOOP
MY_ALIGN
MY_ALIGN
LSGEMM_1x4_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_1x4_SAVE
MY_ALIGN
LSGEMM_1x4_SUB2:
andi. T10,L, 32
ble LSGEMM_1x4_SUB2_16
KERNEL1x4_4 0,0, 0,0
KERNEL1x4_4 0,0, 1,0
KERNEL1x4_4 0,0, 2,0
KERNEL1x4_4 0,0, 3,0
KERNEL1x4_4 0,0, 4,0
KERNEL1x4_4 0,0, 5,0
KERNEL1x4_4 0,0, 6,0
KERNEL1x4_4 0,0, 7,1
MY_ALIGN
LSGEMM_1x4_SUB2_16:
andi. T10,L, 16
ble LSGEMM_1x4_SUB2_8
KERNEL1x4_4 0,0, 0,0
KERNEL1x4_4 0,0, 1,0
KERNEL1x4_4 0,0, 2,0
KERNEL1x4_4 0,0, 3,1
MY_ALIGN
LSGEMM_1x4_SUB2_8:
andi. T10,L, 8
ble LSGEMM_1x4_SUB2_4
KERNEL1x4_4 0,0, 0,0
KERNEL1x4_4 0,0, 1,1
MY_ALIGN
LSGEMM_1x4_SUB2_4:
andi. T10,L, 4
ble LSGEMM_1x4_SUB2_2
KERNEL1x4_4 0,0, 0,1
MY_ALIGN
LSGEMM_1x4_SUB2_2:
andi. T10,L, 2
ble LSGEMM_1x4_SUB2_1
KERNEL1x4_2 0,0, 0,1
MY_ALIGN
LSGEMM_1x4_SUB2_1:
andi. T10,L, 1
ble LSGEMM_1x4_SAVE
KERNEL1x4
MY_ALIGN
LSGEMM_1x4_SAVE:
SAVE1x4
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1
#endif
MY_ALIGN
LSGEMM_1x4_END:
andi. I, M, 2
ble LSGEMM_1x2_END
MY_ALIGN
LSGEMM_1x2_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,2,1
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO1x2
ble LSGEMM_1x2_SUB0
mtctr L
MY_ALIGN
LSGEMM_1x2_LOOP:
KERNEL1x2_4 0,0, 0,0
KERNEL1x2_4 0,0, 1,0
KERNEL1x2_4 0,0, 2,0
KERNEL1x2_4 0,0, 3,0
KERNEL1x2_4 0,0, 4,0
KERNEL1x2_4 0,0, 5,0
KERNEL1x2_4 0,0, 6,0
KERNEL1x2_4 0,0, 7,0
KERNEL1x2_4 0,0, 8,0
KERNEL1x2_4 0,0, 9,0
KERNEL1x2_4 0,0, 10,0
KERNEL1x2_4 0,0, 11,0
KERNEL1x2_4 0,0, 12,0
KERNEL1x2_4 0,0, 13,0
KERNEL1x2_4 0,0, 14,0
KERNEL1x2_4 0,0, 15,1
bdnz LSGEMM_1x2_LOOP
MY_ALIGN
MY_ALIGN
LSGEMM_1x2_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_1x2_SAVE
MY_ALIGN
LSGEMM_1x2_SUB2:
andi. T10,L, 32
ble LSGEMM_1x2_SUB2_16
KERNEL1x2_4 0,0, 0,0
KERNEL1x2_4 0,0, 1,0
KERNEL1x2_4 0,0, 2,0
KERNEL1x2_4 0,0, 3,0
KERNEL1x2_4 0,0, 4,0
KERNEL1x2_4 0,0, 5,0
KERNEL1x2_4 0,0, 6,0
KERNEL1x2_4 0,0, 7,1
MY_ALIGN
LSGEMM_1x2_SUB2_16:
andi. T10,L, 16
ble LSGEMM_1x2_SUB2_8
KERNEL1x2_4 0,0, 0,0
KERNEL1x2_4 0,0, 1,0
KERNEL1x2_4 0,0, 2,0
KERNEL1x2_4 0,0, 3,1
MY_ALIGN
LSGEMM_1x2_SUB2_8:
andi. T10,L, 8
ble LSGEMM_1x2_SUB2_4
KERNEL1x2_4 0,0, 0,0
KERNEL1x2_4 0,0, 1,1
MY_ALIGN
LSGEMM_1x2_SUB2_4:
andi. T10,L, 4
ble LSGEMM_1x2_SUB2_2
KERNEL1x2_4 0,0, 0,1
MY_ALIGN
LSGEMM_1x2_SUB2_2:
andi. T10,L, 2
ble LSGEMM_1x2_SUB2_1
KERNEL1x2_2 0,0, 0,1
MY_ALIGN
LSGEMM_1x2_SUB2_1:
andi. T10,L, 1
ble LSGEMM_1x2_SAVE
KERNEL1x2
MY_ALIGN
LSGEMM_1x2_SAVE:
SAVE1x2
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1
#endif
MY_ALIGN
LSGEMM_1x2_END:
andi. I, M, 1
ble LSGEMM_1x1_END
MY_ALIGN
LSGEMM_1x1_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
#else
mr BO, B
#endif
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,1,1
srawi. L, T11, 6 /**(T11 ) % 64x */
#else
srawi. L, K, 6 /**(K ) % 64x */
#endif
ZERO1x1
ble LSGEMM_1x1_SUB0
mtctr L
MY_ALIGN
LSGEMM_1x1_LOOP:
KERNEL1x1_16 0,0, 0,0
KERNEL1x1_16 0,0, 1,0
KERNEL1x1_16 0,0, 2,0
KERNEL1x1_16 0,0, 3,1
bdnz LSGEMM_1x1_LOOP
MY_ALIGN
MY_ALIGN
LSGEMM_1x1_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 63
#else
andi. L, K, 63
#endif
ble LSGEMM_1x1_SAVE
MY_ALIGN
LSGEMM_1x1_SUB2:
andi. T10,L, 32
ble LSGEMM_1x1_SUB2_16
KERNEL1x1_16 0,0, 0,0
KERNEL1x1_16 0,0, 1,1
MY_ALIGN
LSGEMM_1x1_SUB2_16:
andi. T10,L, 16
ble LSGEMM_1x1_SUB2_8
KERNEL1x1_16 0,0, 0,1
MY_ALIGN
LSGEMM_1x1_SUB2_8:
andi. T10,L, 8
ble LSGEMM_1x1_SUB2_4
KERNEL1x1_8 0,0, 0,1
MY_ALIGN
LSGEMM_1x1_SUB2_4:
andi. T10,L, 4
ble LSGEMM_1x1_SUB2_2
KERNEL1x1_4 0,0, 0,1
MY_ALIGN
LSGEMM_1x1_SUB2_2:
andi. T10,L, 2
ble LSGEMM_1x1_SUB2_1
KERNEL1x1_2 0,0, 0,1
MY_ALIGN
LSGEMM_1x1_SUB2_1:
andi. T10,L, 1
ble LSGEMM_1x1_SAVE
KERNEL1x1
MY_ALIGN
LSGEMM_1x1_SAVE:
SAVE1x1
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1
#endif
MY_ALIGN
LSGEMM_1x1_END:
slwi T1, K, 2
add B, B, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
addi TEMP_REG, TEMP_REG, 1
#endif
LSGEMM_END: