diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 85bcbc710..dd016a7c3 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -207,7 +207,7 @@ int main(int argc, char *argv[]){ for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - + fprintf(stderr, " SIZE Flops Time\n"); for (i = from; i <= to; i += step) { diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 5c10ad64a..440eaab1b 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index a1762dcf2..2fb1b27ef 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S @@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) stfd f1, ALPHA_SP @@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r15, 272(SP) ld r14, 280(SP) - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) addi SP, SP, STACKSIZE blr diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index f408cdc17..7a0f3143e 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LOAD ld #define STACKSIZE (512 ) - +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ #define M r3 #define N r4 #define K r5 @@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROFCODE addi SP, SP, -STACKSIZE - li r0, 0 + mflr r0 + stfd f14, 0(SP) stfd f15, 8(SP) @@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) - + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) #if defined(TRMMKERNEL) @@ -157,72 +158,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif slwi LDC, LDC, 2 - -/* cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 -*/ /*alpha is stored in f1. convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 /*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ lis T2, perm_const2@highest - ori T2, T2, perm_const2@higher - rldicr T2, T2, 32, 31 - oris T2, T2, perm_const2@h - ori T2, T2, perm_const2@l - lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l ori T1, T1, perm_const1@l - + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 mtvsrdd permute_mask,T2,T1 - - lis T2, save_permute_12@highest - ori T2, T2, save_permute_12@higher - rldicr T2, T2, 32, 31 - oris T2, T2, save_permute_12@h - ori T2, T2, save_permute_12@l - - lis T1, save_permute_11@highest - ori T1, T1, save_permute_11@higher - rldicr T1, T1, 32, 31 - oris T1, T1, save_permute_11@h - ori T1, T1, save_permute_11@l - - mtvsrdd save_permute_1,T2,T1 - - lis T2, save_permute_22@highest - ori T2, T2, save_permute_22@higher - rldicr T2, T2, 32, 31 - oris T2, T2, save_permute_22@h - ori T2, T2, save_permute_22@l - - lis T1, save_permute_21@highest - ori T1, T1, save_permute_21@higher - rldicr T1, T1, 32, 31 - oris T1, T1, save_permute_21@h - ori T1, T1, save_permute_21@l - - mtvsrdd save_permute_2,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 #include "sgemm_logic_power9.S" -.L999: - addi r3, 0, 0 - +.L999: lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) @@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) - - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) + ld r0, FLINK_SAVE(SP) - addi SP, SP, STACKSIZE + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE blr + EPILOGUE #endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index c149cb903..25e8c8387 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,5 +1,94 @@ #define MY_ALIGN .align 3 +b L8 + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_0 + mtctr L + MY_ALIGN + +LSGEMM_L8x16_LOOP: + + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_2 64,32, 15,0 + KERNEL8x16_I1_L4_2 64,32, 16,0 + KERNEL8x16_I1_L4_2 64,32, 17,0 + KERNEL8x16_I1_L4_2 64,32, 18,0 + KERNEL8x16_I1_L4_2 64,32, 19,0 + KERNEL8x16_I1_L4_2 64,32, 20,0 + KERNEL8x16_I1_L4_2 64,32, 21,0 + KERNEL8x16_I1_L4_2 64,32, 22,0 + KERNEL8x16_I1_L4_2 64,32, 23,0 + KERNEL8x16_I1_L4_2 64,32, 24,0 + KERNEL8x16_I1_L4_2 64,32, 25,0 + KERNEL8x16_I1_L4_2 64,32, 26,0 + KERNEL8x16_I1_L4_2 64,32, 27,0 + KERNEL8x16_I1_L4_2 64,32, 28,0 + KERNEL8x16_I1_L4_2 64,32, 29,0 + KERNEL8x16_I1_L4_2 64,32, 30,0 + KERNEL8x16_I1_L4_2 64,32, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16 0, AO, BO, 64, 32 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_3 64,32, 15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_3 64,32, 7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_3 64,32, 3,1 + blr + +L8: #if defined(TRMMKERNEL) && !defined(LEFT) neg TEMP_REG, OFFSET #endif @@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN: REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ + srawi. L, T12, 7 /**(T11-1) % 128x */ #else mr T12, K addi T12,T12, -1 - srawi. L, T12, 6 /**(K-1) % 64x */ + srawi. 
L, T12, 7 /**(K-1) % 128x */ #endif ZERO8x16 ble LSGEMM_L8x16_SUB0 - - MY_ALIGN -LSGEMM_L8x16_LOOP_START: - - LOAD8x16_0 /*we already zeroed */ - /*##OffsetA=64 OffsetB=32 - #addi AO,AO,2112 - #addi BO,BO,32 */ - - mtctr L - - MY_ALIGN - -LSGEMM_L8x16_LOOP: - - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_2 64,32, 15,1 - - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - - END8x16 0, AO, BO, 64, 32 - - b LSGEMM_L8x16_SUB1 + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 MY_ALIGN LSGEMM_L8x16_SUB0: #if defined(TRMMKERNEL) - andi. L, T11, 127 + andi. L, T11, 255 + cmpwi T11,128 #else - andi. L, K, 127 + andi. L, K, 255 + cmpwi K,128 #endif - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L8x16_SAVE + + bne LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB2_128: + bl LSGEMM_L8x16_L64_SUB + bl LSGEMM_L8x16_L64_SUB + b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: - - srawi. T10,L, 5 + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 ble LSGEMM_L8x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L8x16_SUB2_LOOP: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_3 64,32, 7,1 - bdnz LSGEMM_L8x16_SUB2_LOOP - MY_ALIGN + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN LSGEMM_L8x16_SUB2_16: andi. T10,L, 16 ble LSGEMM_L8x16_SUB2_8 - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_3 64,32, 3,1 + bl LSGEMM_L8x16_L16_SUB MY_ALIGN LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 @@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 ble LSGEMM_L8x16_SAVE KERNEL8x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L8x16_SUB2 + MY_ALIGN LSGEMM_L8x16_SAVE: diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index c61f419ac..3f86a1d25 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 + KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast @@ -112,15 +112,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxv vs24, 0(BO) lxv vs28, 16(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask lxv vs0, 0(AO) lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 - + lxv vs2, 32(AO) + lxv vs3, 48(AO) xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 @@ -259,247 +258,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 - - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - 
xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 +KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete .endm @@ -509,224 +269,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
END8x16 \First, AO, BO, 64,32 .endm -.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) +.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs52, vs0,vs29 + xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs60, vs0,vs31 xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs37, vs1,vs25 - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) .endif + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs62, vs2,vs31 + + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp 
vs36, vs4,vs9 .if \Complete==0 lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif +.endif .if \IsLast==1 .if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) .else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) + addi \BREG, \BREG, DISP16(\Index,64) + .endif +.endif + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask .endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs52, vs4,vs13 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 .endif + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs60, vs4,vs15 .if \Complete==0 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 + +.endif - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - xvmulsp vs50, vs6,vs12 - xvmulsp vs51, vs7,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - xvmulsp vs54, vs6,vs13 - xvmulsp vs55, vs7,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - xvmulsp vs58, vs6,vs14 - xvmulsp vs59, vs7,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - xvmulsp vs62, vs6,vs15 - xvmulsp vs63, vs7,vs15 - -.else - xvmaddasp vs40, vs4,vs10 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs37, vs5,vs9 xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs62, vs6,vs15 + + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs51, vs7,vs12 + xvmaddasp vs55, vs7,vs13 + xvmaddasp vs59, vs7,vs14 xvmaddasp vs63, vs7,vs15 - -.endif - + .endm @@ -763,7 +433,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. xxmrghw vs2, vs37, vs41 xxmrghw vs3, vs33, vs45 - +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif xxmrglw vs16, vs34, vs46 xxmrglw vs18, vs38, vs42 @@ -784,176 +457,203 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxmrghw vs30, vs39, vs43 xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) +#ifndef TRMMKERNEL lxv vs34, 32(CO) lxv vs35, 48(CO) #endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 #ifndef TRMMKERNEL lxv vs36, 0(T1) lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL lxv vs38, 32(T1) lxv vs39, 48(T1) #endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + #ifndef TRMMKERNEL lxv vs40, 0(T2) lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL lxv vs42, 32(T2) lxv vs43, 48(T2) #endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 #ifndef TRMMKERNEL lxv vs44, 0(T3) - lxv vs45, 16(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL lxv vs46, 32(T3) lxv vs47, 48(T3) #endif - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 + + - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 - +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif xxperm vs24, vs30, save_permute_1 xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r +#else xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv 
vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - stxv vs36, 0(T1) - stxv vs37, 16(T1) stxv vs38, 32(T1) stxv vs39, 48(T1) +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T2) stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif stxv vs42, 32(T2) stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif stxv vs44, 0(T3) stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif stxv vs46, 32(T3) stxv vs47, 48(T3) /*****the same with the second 8X8 ****/ -#ifndef TRMMKERNEL - + #ifndef TRMMKERNEL lxv vs32, 0(T4) lxv vs33, 16(T4) - lxv vs34, 32(T4) - lxv vs35, 48(T4) - lxv vs36, 0(T5) - lxv vs37, 16(T5) - lxv vs38,32(T5) - lxv vs39, 48(T5) #endif - xxmrglw vs8, vs48, vs60 xxmrglw vs10, vs52, vs56 - +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif xxmrghw vs1, vs48, vs60 xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif xxmrglw vs12, vs49, vs61 xxmrglw vs14, vs53, vs57 - -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) - lxv vs42, 32(T6) - lxv vs43, 48(T6) - lxv vs44, 0(T7) - lxv vs45, 16(T7) - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + xxmrghw vs2, vs53, vs57 xxmrghw vs3, vs49, vs61 - +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif xxmrglw vs16, vs50, vs62 xxmrglw vs18, vs54, vs58 - +#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxmrghw vs4, vs54, vs58 xxmrghw vs5, vs50, vs62 - +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 - + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif xxlor vs17, vs16, vs16 xxlor vs19, vs18, vs18 xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 + xxmrghw vs31, vs51, vs63 + + xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 @@ -965,11 +665,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs27, vs26, vs26 xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 - + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif xxperm vs16, vs4, save_permute_1 xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 xxperm vs24, vs30, save_permute_1 @@ -977,64 +686,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r +#ifdef TRMMKERNEL xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r +#else xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif - stxv vs32, 0(T4) - stxv vs33, 16(T4) - stxv vs34, 32(T4) - stxv vs35, 48(T4) - stxv vs36, 0(T5) - stxv vs37, 16(T5) + + stxv vs38, 32(T5) stxv vs39, 48(T5) + #ifdef TRMMKERNEL xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - + xvmulsp vs41, vs14, alpha_r +#else xvmaddasp vs40, vs10, alpha_r xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - #endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif stxv vs42, 32(T6) stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T7) stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T7) stxv vs47, 48(T7) @@ -1224,12 +946,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 @@ -1247,21 +971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 @@ -1285,21 +1009,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 - + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 @@ -1323,22 +1048,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - .if \Complete==0 lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.if \Complete==0 lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask +.if \Complete==0 xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index e655f0bfe..a41bcec77 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -30,10 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LOAD ld -#define STACKSIZE 32192 +#define STACKSIZE 512 #define FZERO 312+192(SP) - + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ #define M r3 #define N r4 @@ -56,20 +57,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FRAMEPOINTER r12 -#define BBUFFER r14 +#define T10 r14 #define L r15 -#define ALPHA r16 +#define T8 r16 #define T5 r17 #define T2 r19 -#define BBO r20 -#define o8 r21 +#define T9 r20 +#define T6 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 -#define o16 r27 +#define T7 r27 #define T3 r28 #define T4 r29 @@ -82,12 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROFCODE mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - li r0, 0 - + addi SP, SP, -STACKSIZE + mflr r0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) @@ -111,6 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f30, 128(SP) stfd f31, 136(SP) + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ std r31, 144(SP) std r30, 152(SP) @@ -132,21 +131,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) - stw r0, FZERO #ifdef linux ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) @@ -162,35 +161,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemm_macros_power9.S" - cmpwi cr0, M, 0 - ble L999 - cmpwi cr0, N, 0 - ble L999 - cmpwi cr0, K, 0 - ble L999 + slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li o8 , 8 - li o16 , 16 - - addi BBUFFER, SP, 512+4096 - li T1, -4096 - and BBUFFER, BBUFFER, T1 - + li PRE, 512 + li r0, 0 - addi ALPHA, SP, 296+192 - - xxlor alpha_r,vs1,vs1 /*copy from register f1 */ - xxlor alpha_i,vs2,vs2 /*copy from register f2 */ +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif .align 4 #include "zgemm_logic_power9.S" L999: - addi r3, 0, 0 - + lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) @@ -233,24 +221,24 @@ L999: ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) - - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE blr EPILOGUE diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index 77ce36294..01685fe79 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -25,155 +25,348 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ + + + +/* 2x8 MAIN 128x+1 LOOP */ +ZGEMM_L2x8_LMAIN_SUB: + mtctr L + LOAD2x8 0 + MY_ALIGN +ZGEMM_L2x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,0 + KERNEL2x8_L 128,32,16,0 + KERNEL2x8_L 128,32,17,0 + KERNEL2x8_L 128,32,18,0 + KERNEL2x8_L 128,32,19,0 + KERNEL2x8_L 128,32,20,0 + KERNEL2x8_L 128,32,21,0 + KERNEL2x8_L 128,32,22,0 + KERNEL2x8_L 128,32,23,0 + KERNEL2x8_L 128,32,24,0 + KERNEL2x8_L 128,32,25,0 + KERNEL2x8_L 128,32,26,0 + KERNEL2x8_L 128,32,27,0 + KERNEL2x8_L 128,32,28,0 + KERNEL2x8_L 128,32,29,0 + KERNEL2x8_L 128,32,30,0 + KERNEL2x8_L 128,32,31,0 + KERNEL2x8_L 128,32,32,0 + KERNEL2x8_L 128,32,33,0 + KERNEL2x8_L 128,32,34,0 + KERNEL2x8_L 128,32,35,0 + KERNEL2x8_L 128,32,36,0 + KERNEL2x8_L 128,32,37,0 + KERNEL2x8_L 128,32,38,0 + KERNEL2x8_L 128,32,39,0 + KERNEL2x8_L 128,32,40,0 + KERNEL2x8_L 128,32,41,0 + KERNEL2x8_L 128,32,42,0 + KERNEL2x8_L 128,32,43,0 + KERNEL2x8_L 128,32,44,0 + KERNEL2x8_L 128,32,45,0 + KERNEL2x8_L 128,32,46,0 + KERNEL2x8_L 128,32,47,0 + KERNEL2x8_L 128,32,48,0 + KERNEL2x8_L 128,32,49,0 + KERNEL2x8_L 128,32,50,0 + KERNEL2x8_L 128,32,51,0 + KERNEL2x8_L 128,32,52,0 + KERNEL2x8_L 128,32,53,0 + KERNEL2x8_L 128,32,54,0 + KERNEL2x8_L 128,32,55,0 + KERNEL2x8_L 128,32,56,0 + KERNEL2x8_L 128,32,57,0 + KERNEL2x8_L 128,32,58,0 + KERNEL2x8_L 128,32,59,0 + KERNEL2x8_L 128,32,60,0 + KERNEL2x8_L 128,32,61,0 + KERNEL2x8_L 128,32,62,0 + KERNEL2x8_L 128,32,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: + END2x8 AO, BO, 128,32 + blr + + MY_ALIGN +ZGEMM_2x8_L64_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,0 + KERNEL2x8_L 128,32,16,0 + KERNEL2x8_L 128,32,17,0 + KERNEL2x8_L 128,32,18,0 + KERNEL2x8_L 128,32,19,0 + KERNEL2x8_L 128,32,20,0 + KERNEL2x8_L 128,32,21,0 + KERNEL2x8_L 128,32,22,0 + KERNEL2x8_L 128,32,23,0 + KERNEL2x8_L 128,32,24,0 + KERNEL2x8_L 128,32,25,0 + KERNEL2x8_L 128,32,26,0 + KERNEL2x8_L 128,32,27,0 + KERNEL2x8_L 128,32,28,0 + KERNEL2x8_L 128,32,29,0 + KERNEL2x8_L 128,32,30,0 + KERNEL2x8_E 128,32,31,1 + blr + + + MY_ALIGN +ZGEMM_2x8_L32_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 
+ KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,1 + blr + MY_ALIGN + +ZGEMM_2x8_L16_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,1 + blr + MY_ALIGN + +ZGEMM_2x4_LMAIN_SUB: + mtctr L + LOAD2x4 0 + MY_ALIGN +ZGEMM_L2x4_LOOP: + KERNEL2x4_L 64,32,0,0 + KERNEL2x4_L 64,32,1,0 + KERNEL2x4_L 64,32,2,0 + KERNEL2x4_L 64,32,3,0 + KERNEL2x4_L 64,32,4,0 + KERNEL2x4_L 64,32,5,0 + KERNEL2x4_L 64,32,6,0 + KERNEL2x4_L 64,32,7,0 + KERNEL2x4_L 64,32,8,0 + KERNEL2x4_L 64,32,9,0 + KERNEL2x4_L 64,32,10,0 + KERNEL2x4_L 64,32,11,0 + KERNEL2x4_L 64,32,12,0 + KERNEL2x4_L 64,32,13,0 + KERNEL2x4_L 64,32,14,0 + KERNEL2x4_L 64,32,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: + END2x4 AO, BO, 64,32 + blr + + MY_ALIGN +ZGEMM_2x4_L16_SUB: + LOAD2x4 0 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_L 64,32, 1,0 + KERNEL2x4_L 64,32, 2,0 + KERNEL2x4_L 64,32, 3,0 + KERNEL2x4_L 64,32, 4,0 + KERNEL2x4_L 64,32, 5,0 + KERNEL2x4_L 64,32, 6,0 + KERNEL2x4_E 64,32, 7,1 + blr + + MY_ALIGN +ZGEMM_2x4_L8_SUB: + LOAD2x4 0 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_L 64,32, 1,0 + KERNEL2x4_L 64,32, 2,0 + KERNEL2x4_E 64,32, 3,1 + blr + +/* MAIN LOOP BEGINS */ + + MY_ALIGN +ZGEMM_L2: srawi. J, N, 1 ble ZGEMM_L2_END ZGEMM_L2_BEGIN: - - mr BO, B - mr BBO, BBUFFER - srawi. T1, K, 2 - ble ZGEMM_L2_COPYB1 - -ZGEMM_L2_COPYB8: - - addi T2, PRE, 128 - dcbt BO, PRE - dcbtst BBO, PRE - dcbtst BBO, T2 - ZCOPYB_8 - addic. T1, T1, -1 - - bgt ZGEMM_L2_COPYB8 - -ZGEMM_L2_COPYB1: - - andi. T1, K, 3 - ble ZGEMM_L2_COPYB_END - -ZGEMM_L2_COPYB_LOOP: - - ZCOPYB_2 - addic. T1, T1, -1 - - bgt ZGEMM_L2_COPYB_LOOP - -ZGEMM_L2_COPYB_END: - - mr CO, C - mr AO, A - slwi T1, LDC , 1 + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A add C, C, T1 srawi. I, M, 3 ble ZGEMM_L2x8_END - -ZGEMM_L2x8_BEGIN: - - - mr BO, BBUFFER + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 +ZGEMM_L2x8_BEGIN: mr T1, K + mr BO, B + dcbt B, r0 + dcbt AO, r0 + /* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 32x */ + /* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. L, T1, 7 /**(K-1) % 128x */ + ZERO2x8 ble ZGEMM_L2x8_SUB0 - - -ZGEMM_L2x8_LOOP_START: - - LOAD2x8 0 - li T2, 1024 - li T3, 1024+512 - li T4, 2048 - li T5, 2048+512 - mtctr L - - MY_ALIGN -ZGEMM_L2x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,64,0,0 - KERNEL2x8_L 128,64,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,64,2,0 - KERNEL2x8_L 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,64,4,0 - KERNEL2x8_L 128,64,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,64,6,0 - KERNEL2x8_L 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,64,8,0 - KERNEL2x8_L 128,64,9,0 - KERNEL2x8_L 128,64,10,0 - KERNEL2x8_L 128,64,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,64,12,0 - KERNEL2x8_L 128,64,13,0 - KERNEL2x8_L 128,64,14,0 - KERNEL2x8_L 128,64,15,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: - END2x8 AO, BO, 128, 64 - - b ZGEMM_L2x8_SUB1 - -ZGEMM_L2x8_SUB0: - - andi. L, K, 63 - - b ZGEMM_L2x8_SUB2 - -ZGEMM_L2x8_SUB1: - - andi. L, T1, 31 + bl ZGEMM_L2x8_LMAIN_SUB + + andi. L, T1, 127 ble ZGEMM_L2x8_SAVE - -ZGEMM_L2x8_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x8_SUB2_4 - mtctr T1 + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB0: + andi. 
L, K, 255 + cmpwi K,128 + bne ZGEMM_L2x8_SUB2 + MY_ALIGN +ZGEMM_L2x8_SUB2_128: + bl ZGEMM_2x8_L64_SUB + bl ZGEMM_2x8_L64_SUB + b ZGEMM_L2x8_SAVE MY_ALIGN -ZGEMM_L2x8_SUB2_LOOP: +ZGEMM_L2x8_SUB2: + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_32: + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_16: + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_8: + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 LOAD2x8 0 - KERNEL2x8_L 128,64, 0,0 - KERNEL2x8_L 128,64, 1,0 - KERNEL2x8_L 128,64, 2,0 - KERNEL2x8_E 128,64, 3,1 - bdnz ZGEMM_L2x8_SUB2_LOOP - MY_ALIGN + KERNEL2x8_L 128,32, 0,0 + KERNEL2x8_L 128,32, 1,0 + KERNEL2x8_L 128,32, 2,0 + KERNEL2x8_E 128,32, 3,1 + MY_ALIGN ZGEMM_L2x8_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x8_SUB2_2 LOAD2x8 0 - KERNEL2x8_L 128,64, 0,0 - KERNEL2x8_E 128,64, 1,1 + KERNEL2x8_L 128,32, 0,0 + KERNEL2x8_E 128,32, 1,1 MY_ALIGN ZGEMM_L2x8_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x8_SUB2_1 LOAD2x8 0 - KERNEL2x8_E 128,64, 0,1 + KERNEL2x8_E 128,32, 0,1 MY_ALIGN ZGEMM_L2x8_SUB2_1: andi. T1,L, 1 ble ZGEMM_L2x8_SAVE - KERNEL2x8 - -/* addic. L, L, -1 - bgt ZGEMM_L2x8_SUB2_1*/ + KERNEL2x8 ZGEMM_L2x8_SAVE: - + addic. I, I, -1 SAVE2x8 - addic. I, I, -1 bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN ZGEMM_L2x8_END: ZGEMM_L2x4_BEGIN: @@ -183,70 +376,50 @@ ZGEMM_L2x4_BEGIN: andi. T1, M, 4 ble ZGEMM_L2x4_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 - srawi. L, T1, 4 /**(K-1) % 16x */ - ZERO2x4 + ZERO2x4 + srawi. L, T1, 5 /**(K-1) % 32x */ + ble ZGEMM_L2x4_SUB0 - -ZGEMM_L2x4_LOOP_START: - LOAD2x4 0 - mtctr L - - MY_ALIGN -ZGEMM_L2x4_LOOP: - KERNEL2x4_L 64,64,0,0 - KERNEL2x4_L 64,64,1,0 - KERNEL2x4_L 64,64,2,0 - KERNEL2x4_L 64,64,3,0 - KERNEL2x4_L 64,64,4,0 - KERNEL2x4_L 64,64,5,0 - KERNEL2x4_L 64,64,6,0 - KERNEL2x4_L 64,64,7,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: - END2x4 AO, BO, 64, 64 - - b ZGEMM_L2x4_SUB1 - -ZGEMM_L2x4_SUB0: - - andi. L, K, 31 - + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE b ZGEMM_L2x4_SUB2 -ZGEMM_L2x4_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x4_SAVE - -ZGEMM_L2x4_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x4_SUB2_4 - mtctr T1 +ZGEMM_L2x4_SUB0: + andi. L, K, 63 + cmpwi K,32 + bne ZGEMM_L2x4_SUB2 + MY_ALIGN +ZGEMM_L2x4_SUB2_32: + bl ZGEMM_2x4_L16_SUB + bl ZGEMM_2x4_L16_SUB + b ZGEMM_L2x4_SAVE + MY_ALIGN +ZGEMM_L2x4_SUB2: + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB MY_ALIGN -ZGEMM_L2x4_SUB2_LOOP: - LOAD2x4 0 - KERNEL2x4_L 64,64, 0,0 - KERNEL2x4_L 64,64, 1,0 - KERNEL2x4_L 64,64, 2,0 - KERNEL2x4_E 64,64, 3,1 - bdnz ZGEMM_L2x4_SUB2_LOOP +ZGEMM_L2x4_SUB2_8: + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB MY_ALIGN ZGEMM_L2x4_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x4_SUB2_2 LOAD2x4 0 - KERNEL2x4_L 64,64, 0,0 - KERNEL2x4_E 64,64, 1,1 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_E 64,32, 1,1 MY_ALIGN ZGEMM_L2x4_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x4_SUB2_1 LOAD2x4 0 - KERNEL2x4_E 64,64, 0,1 + KERNEL2x4_E 64,32, 0,1 MY_ALIGN ZGEMM_L2x4_SUB2_1: andi. T1,L, 1 @@ -259,12 +432,11 @@ ZGEMM_L2x4_SAVE: ZGEMM_L2x4_END: -ZGEMM_L2x2_BEGIN: - +ZGEMM_L2x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L2x2_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. 
L, T1, 4 /**(K-1) % 16x */ @@ -277,18 +449,18 @@ ZGEMM_L2x2_LOOP_START: MY_ALIGN ZGEMM_L2x2_LOOP: - KERNEL2x2_L 32,64,0,0 - KERNEL2x2_L 32,64,1,0 - KERNEL2x2_L 32,64,2,0 - KERNEL2x2_L 32,64,3,0 - KERNEL2x2_L 32,64,4,0 - KERNEL2x2_L 32,64,5,0 - KERNEL2x2_L 32,64,6,0 - KERNEL2x2_L 32,64,7,1 + KERNEL2x2_L 32,32,0,0 + KERNEL2x2_L 32,32,1,0 + KERNEL2x2_L 32,32,2,0 + KERNEL2x2_L 32,32,3,0 + KERNEL2x2_L 32,32,4,0 + KERNEL2x2_L 32,32,5,0 + KERNEL2x2_L 32,32,6,0 + KERNEL2x2_L 32,32,7,1 bdnz ZGEMM_L2x2_LOOP MY_ALIGN ZGEMM_L2x2_LOOP_END: - END2x2 AO, BO, 32, 64 + END2x2 AO, BO, 32,32 b ZGEMM_L2x2_SUB1 @@ -310,24 +482,24 @@ ZGEMM_L2x2_SUB2: MY_ALIGN ZGEMM_L2x2_SUB2_LOOP: LOAD2x2 0 - KERNEL2x2_L 32,64, 0,0 - KERNEL2x2_L 32,64, 1,0 - KERNEL2x2_L 32,64, 2,0 - KERNEL2x2_E 32,64, 3,1 + KERNEL2x2_L 32,32, 0,0 + KERNEL2x2_L 32,32, 1,0 + KERNEL2x2_L 32,32, 2,0 + KERNEL2x2_E 32,32, 3,1 bdnz ZGEMM_L2x2_SUB2_LOOP MY_ALIGN ZGEMM_L2x2_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x2_SUB2_2 LOAD2x2 0 - KERNEL2x2_L 32,64, 0,0 - KERNEL2x2_E 32,64, 1,1 + KERNEL2x2_L 32,32, 0,0 + KERNEL2x2_E 32,32, 1,1 MY_ALIGN ZGEMM_L2x2_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x2_SUB2_1 LOAD2x2 0 - KERNEL2x2_E 32,64, 0,1 + KERNEL2x2_E 32,32, 0,1 MY_ALIGN ZGEMM_L2x2_SUB2_1: andi. T1,L, 1 @@ -339,12 +511,12 @@ ZGEMM_L2x2_SAVE: ZGEMM_L2x2_END: -ZGEMM_L2x1_BEGIN: +ZGEMM_L2x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L2x1_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 4 /**(K-1) % 16x */ @@ -358,18 +530,18 @@ ZGEMM_L2x1_LOOP_START: MY_ALIGN ZGEMM_L2x1_LOOP: - KERNEL2x1_L 16,64,0,0 - KERNEL2x1_L 16,64,1,0 - KERNEL2x1_L 16,64,2,0 - KERNEL2x1_L 16,64,3,0 - KERNEL2x1_L 16,64,4,0 - KERNEL2x1_L 16,64,5,0 - KERNEL2x1_L 16,64,6,0 - KERNEL2x1_L 16,64,7,1 + KERNEL2x1_L 16,32,0,0 + KERNEL2x1_L 16,32,1,0 + KERNEL2x1_L 16,32,2,0 + KERNEL2x1_L 16,32,3,0 + KERNEL2x1_L 16,32,4,0 + KERNEL2x1_L 16,32,5,0 + KERNEL2x1_L 16,32,6,0 + KERNEL2x1_L 16,32,7,1 bdnz ZGEMM_L2x1_LOOP MY_ALIGN ZGEMM_L2x1_LOOP_END: - END2x1 AO, BO, 16, 64 + END2x1 AO, BO, 16,32 b ZGEMM_L2x1_SUB1 @@ -391,24 +563,24 @@ ZGEMM_L2x1_SUB2: MY_ALIGN ZGEMM_L2x1_SUB2_LOOP: LOAD2x1 0 - KERNEL2x1_L 16,64, 0,0 - KERNEL2x1_L 16,64, 1,0 - KERNEL2x1_L 16,64, 2,0 - KERNEL2x1_E 16,64, 3,1 + KERNEL2x1_L 16,32, 0,0 + KERNEL2x1_L 16,32, 1,0 + KERNEL2x1_L 16,32, 2,0 + KERNEL2x1_E 16,32, 3,1 bdnz ZGEMM_L2x1_SUB2_LOOP MY_ALIGN ZGEMM_L2x1_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x1_SUB2_2 LOAD2x1 0 - KERNEL2x1_L 16,64, 0,0 - KERNEL2x1_E 16,64, 1,1 + KERNEL2x1_L 16,32, 0,0 + KERNEL2x1_E 16,32, 1,1 MY_ALIGN ZGEMM_L2x1_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x1_SUB2_1 LOAD2x1 0 - KERNEL2x1_E 16,64, 0,1 + KERNEL2x1_E 16,32, 0,1 MY_ALIGN ZGEMM_L2x1_SUB2_1: andi. T1,L, 1 @@ -442,36 +614,6 @@ ZGEMM_L1_BEGIN: andi. T1, N, 1 ble ZGEMM_L1_END - mr BO, B - mr BBO, BBUFFER - srawi. T1, K, 3 /*this time K/8 */ - ble ZGEMM_L1_COPYB1 - -ZGEMM_L1_COPYB8: - - addi T2, PRE, 128 - dcbt BO, PRE - dcbtst BBO, PRE - dcbtst BBO, T2 - ZCOPYB_8 - addic. T1, T1, -1 - - bgt ZGEMM_L1_COPYB8 - -ZGEMM_L1_COPYB1: - - andi. T1, K, 7 - ble ZGEMM_L1_COPYB_END - -ZGEMM_L1_COPYB_LOOP: - - ZCOPYB_1 - addic. T1, T1, -1 - - bgt ZGEMM_L1_COPYB_LOOP - -ZGEMM_L1_COPYB_END: - mr CO, C mr AO, A srawi. I, M, 3 @@ -480,7 +622,7 @@ ZGEMM_L1_COPYB_END: ZGEMM_L1x8_BEGIN: - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. 
L, T1, 5 /**(K-1) % 32x */ @@ -501,33 +643,33 @@ ZGEMM_L1x8_LOOP_START: ZGEMM_L1x8_LOOP: dcbt AO, PRE dcbt BO, PRE - KERNEL1x8_L 128,32,0,0 - KERNEL1x8_L 128,32,1,0 + KERNEL1x8_L 128,16,0,0 + KERNEL1x8_L 128,16,1,0 dcbt AO, T2 - KERNEL1x8_L 128,32,2,0 - KERNEL1x8_L 128,32,3,0 + KERNEL1x8_L 128,16,2,0 + KERNEL1x8_L 128,16,3,0 dcbt AO, T3 dcbt BO, T2 - KERNEL1x8_L 128,32,4,0 - KERNEL1x8_L 128,32,5,0 + KERNEL1x8_L 128,16,4,0 + KERNEL1x8_L 128,16,5,0 dcbt AO, T4 - KERNEL1x8_L 128,32,6,0 - KERNEL1x8_L 128,32,7,0 + KERNEL1x8_L 128,16,6,0 + KERNEL1x8_L 128,16,7,0 dcbt AO, T5 dcbt BO, T3 - KERNEL1x8_L 128,32,8,0 - KERNEL1x8_L 128,32,9,0 - KERNEL1x8_L 128,32,10,0 - KERNEL1x8_L 128,32,11,0 + KERNEL1x8_L 128,16,8,0 + KERNEL1x8_L 128,16,9,0 + KERNEL1x8_L 128,16,10,0 + KERNEL1x8_L 128,16,11,0 dcbt BO, T4 - KERNEL1x8_L 128,32,12,0 - KERNEL1x8_L 128,32,13,0 - KERNEL1x8_L 128,32,14,0 - KERNEL1x8_L 128,32,15,1 + KERNEL1x8_L 128,16,12,0 + KERNEL1x8_L 128,16,13,0 + KERNEL1x8_L 128,16,14,0 + KERNEL1x8_L 128,16,15,1 bdnz ZGEMM_L1x8_LOOP MY_ALIGN ZGEMM_L1x8_LOOP_END: - END1x8 AO, BO, 128, 32 + END1x8 AO, BO, 128,16 b ZGEMM_L1x8_SUB1 @@ -549,32 +691,30 @@ ZGEMM_L1x8_SUB2: MY_ALIGN ZGEMM_L1x8_SUB2_LOOP: LOAD1x8 0 - KERNEL1x8_L 128,32, 0,0 - KERNEL1x8_L 128,32, 1,0 - KERNEL1x8_L 128,32, 2,0 - KERNEL1x8_E 128,32, 3,1 + KERNEL1x8_L 128,16, 0,0 + KERNEL1x8_L 128,16, 1,0 + KERNEL1x8_L 128,16, 2,0 + KERNEL1x8_E 128,16, 3,1 bdnz ZGEMM_L1x8_SUB2_LOOP MY_ALIGN ZGEMM_L1x8_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x8_SUB2_2 LOAD1x8 0 - KERNEL1x8_L 128,32, 0,0 - KERNEL1x8_E 128,32, 1,1 + KERNEL1x8_L 128,16, 0,0 + KERNEL1x8_E 128,16, 1,1 MY_ALIGN ZGEMM_L1x8_SUB2_2: andi. T1,L, 2 ble ZGEMM_L1x8_SUB2_1 LOAD1x8 0 - KERNEL1x8_E 128,32, 0,1 + KERNEL1x8_E 128,16, 0,1 MY_ALIGN ZGEMM_L1x8_SUB2_1: andi. T1,L, 1 ble ZGEMM_L1x8_SAVE KERNEL1x8 - -/* addic. L, L, -1 - bgt ZGEMM_L1x8_SUB2_1*/ + ZGEMM_L1x8_SAVE: @@ -592,7 +732,7 @@ ZGEMM_L1x4_BEGIN: andi. T1, M, 4 ble ZGEMM_L1x4_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -605,26 +745,26 @@ ZGEMM_L1x4_LOOP_START: MY_ALIGN ZGEMM_L1x4_LOOP: - KERNEL1x4_L 64,32,0,0 - KERNEL1x4_L 64,32,1,0 - KERNEL1x4_L 64,32,2,0 - KERNEL1x4_L 64,32,3,0 - KERNEL1x4_L 64,32,4,0 - KERNEL1x4_L 64,32,5,0 - KERNEL1x4_L 64,32,6,0 - KERNEL1x4_L 64,32,7,0 - KERNEL1x4_L 64,32,8,0 - KERNEL1x4_L 64,32,9,0 - KERNEL1x4_L 64,32,10,0 - KERNEL1x4_L 64,32,11,0 - KERNEL1x4_L 64,32,12,0 - KERNEL1x4_L 64,32,13,0 - KERNEL1x4_L 64,32,14,0 - KERNEL1x4_L 64,32,15,1 + KERNEL1x4_L 64,16,0,0 + KERNEL1x4_L 64,16,1,0 + KERNEL1x4_L 64,16,2,0 + KERNEL1x4_L 64,16,3,0 + KERNEL1x4_L 64,16,4,0 + KERNEL1x4_L 64,16,5,0 + KERNEL1x4_L 64,16,6,0 + KERNEL1x4_L 64,16,7,0 + KERNEL1x4_L 64,16,8,0 + KERNEL1x4_L 64,16,9,0 + KERNEL1x4_L 64,16,10,0 + KERNEL1x4_L 64,16,11,0 + KERNEL1x4_L 64,16,12,0 + KERNEL1x4_L 64,16,13,0 + KERNEL1x4_L 64,16,14,0 + KERNEL1x4_L 64,16,15,1 bdnz ZGEMM_L1x4_LOOP MY_ALIGN ZGEMM_L1x4_LOOP_END: - END1x4 AO, BO, 64, 32 + END1x4 AO, BO, 64,16 b ZGEMM_L1x4_SUB1 @@ -646,24 +786,24 @@ ZGEMM_L1x4_SUB2: MY_ALIGN ZGEMM_L1x4_SUB2_LOOP: LOAD1x4 0 - KERNEL1x4_L 64,32, 0,0 - KERNEL1x4_L 64,32, 1,0 - KERNEL1x4_L 64,32, 2,0 - KERNEL1x4_E 64,32, 3,1 + KERNEL1x4_L 64,16, 0,0 + KERNEL1x4_L 64,16, 1,0 + KERNEL1x4_L 64,16, 2,0 + KERNEL1x4_E 64,16, 3,1 bdnz ZGEMM_L1x4_SUB2_LOOP MY_ALIGN ZGEMM_L1x4_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x4_SUB2_2 LOAD1x4 0 - KERNEL1x4_L 64,32, 0,0 - KERNEL1x4_E 64,32, 1,1 + KERNEL1x4_L 64,16, 0,0 + KERNEL1x4_E 64,16, 1,1 MY_ALIGN ZGEMM_L1x4_SUB2_2: andi. 
T1,L, 2 ble ZGEMM_L1x4_SUB2_1 LOAD1x4 0 - KERNEL1x4_E 64,32, 0,1 + KERNEL1x4_E 64,16, 0,1 MY_ALIGN ZGEMM_L1x4_SUB2_1: andi. T1,L, 1 @@ -681,7 +821,7 @@ ZGEMM_L1x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L1x2_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -694,26 +834,26 @@ ZGEMM_L1x2_LOOP_START: MY_ALIGN ZGEMM_L1x2_LOOP: - KERNEL1x2_L 32,32,0,0 - KERNEL1x2_L 32,32,1,0 - KERNEL1x2_L 32,32,2,0 - KERNEL1x2_L 32,32,3,0 - KERNEL1x2_L 32,32,4,0 - KERNEL1x2_L 32,32,5,0 - KERNEL1x2_L 32,32,6,0 - KERNEL1x2_L 32,32,7,0 - KERNEL1x2_L 32,32,8,0 - KERNEL1x2_L 32,32,9,0 - KERNEL1x2_L 32,32,10,0 - KERNEL1x2_L 32,32,11,0 - KERNEL1x2_L 32,32,12,0 - KERNEL1x2_L 32,32,13,0 - KERNEL1x2_L 32,32,14,0 - KERNEL1x2_L 32,32,15,1 + KERNEL1x2_L 32,16,0,0 + KERNEL1x2_L 32,16,1,0 + KERNEL1x2_L 32,16,2,0 + KERNEL1x2_L 32,16,3,0 + KERNEL1x2_L 32,16,4,0 + KERNEL1x2_L 32,16,5,0 + KERNEL1x2_L 32,16,6,0 + KERNEL1x2_L 32,16,7,0 + KERNEL1x2_L 32,16,8,0 + KERNEL1x2_L 32,16,9,0 + KERNEL1x2_L 32,16,10,0 + KERNEL1x2_L 32,16,11,0 + KERNEL1x2_L 32,16,12,0 + KERNEL1x2_L 32,16,13,0 + KERNEL1x2_L 32,16,14,0 + KERNEL1x2_L 32,16,15,1 bdnz ZGEMM_L1x2_LOOP MY_ALIGN ZGEMM_L1x2_LOOP_END: - END1x2 AO, BO, 32, 32 + END1x2 AO, BO, 32,16 b ZGEMM_L1x2_SUB1 @@ -735,24 +875,24 @@ ZGEMM_L1x2_SUB2: MY_ALIGN ZGEMM_L1x2_SUB2_LOOP: LOAD1x2 0 - KERNEL1x2_L 32,32, 0,0 - KERNEL1x2_L 32,32, 1,0 - KERNEL1x2_L 32,32, 2,0 - KERNEL1x2_E 32,32, 3,1 + KERNEL1x2_L 32,16, 0,0 + KERNEL1x2_L 32,16, 1,0 + KERNEL1x2_L 32,16, 2,0 + KERNEL1x2_E 32,16, 3,1 bdnz ZGEMM_L1x2_SUB2_LOOP MY_ALIGN ZGEMM_L1x2_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x2_SUB2_2 LOAD1x2 0 - KERNEL1x2_L 32,32, 0,0 - KERNEL1x2_E 32,32, 1,1 + KERNEL1x2_L 32,16, 0,0 + KERNEL1x2_E 32,16, 1,1 MY_ALIGN ZGEMM_L1x2_SUB2_2: andi. T1,L, 2 ble ZGEMM_L1x2_SUB2_1 LOAD1x2 0 - KERNEL1x2_E 32,32, 0,1 + KERNEL1x2_E 32,16, 0,1 MY_ALIGN ZGEMM_L1x2_SUB2_1: andi. T1,L, 1 @@ -769,7 +909,7 @@ ZGEMM_L1x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L1x1_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -783,26 +923,26 @@ ZGEMM_L1x1_LOOP_START: MY_ALIGN ZGEMM_L1x1_LOOP: - KERNEL1x1_L 16,32,0,0 - KERNEL1x1_L 16,32,1,0 - KERNEL1x1_L 16,32,2,0 - KERNEL1x1_L 16,32,3,0 - KERNEL1x1_L 16,32,4,0 - KERNEL1x1_L 16,32,5,0 - KERNEL1x1_L 16,32,6,0 - KERNEL1x1_L 16,32,7,0 - KERNEL1x1_L 16,32,8,0 - KERNEL1x1_L 16,32,9,0 - KERNEL1x1_L 16,32,10,0 - KERNEL1x1_L 16,32,11,0 - KERNEL1x1_L 16,32,12,0 - KERNEL1x1_L 16,32,13,0 - KERNEL1x1_L 16,32,14,0 - KERNEL1x1_L 16,32,15,1 + KERNEL1x1_L 16,16,0,0 + KERNEL1x1_L 16,16,1,0 + KERNEL1x1_L 16,16,2,0 + KERNEL1x1_L 16,16,3,0 + KERNEL1x1_L 16,16,4,0 + KERNEL1x1_L 16,16,5,0 + KERNEL1x1_L 16,16,6,0 + KERNEL1x1_L 16,16,7,0 + KERNEL1x1_L 16,16,8,0 + KERNEL1x1_L 16,16,9,0 + KERNEL1x1_L 16,16,10,0 + KERNEL1x1_L 16,16,11,0 + KERNEL1x1_L 16,16,12,0 + KERNEL1x1_L 16,16,13,0 + KERNEL1x1_L 16,16,14,0 + KERNEL1x1_L 16,16,15,1 bdnz ZGEMM_L1x1_LOOP MY_ALIGN ZGEMM_L1x1_LOOP_END: - END1x1 AO, BO, 16, 32 + END1x1 AO, BO, 16, 16 b ZGEMM_L1x1_SUB1 @@ -824,24 +964,24 @@ ZGEMM_L1x1_SUB2: MY_ALIGN ZGEMM_L1x1_SUB2_LOOP: LOAD1x1 0 - KERNEL1x1_L 16,32, 0,0 - KERNEL1x1_L 16,32, 1,0 - KERNEL1x1_L 16,32, 2,0 - KERNEL1x1_E 16,32, 3,1 + KERNEL1x1_L 16,16, 0,0 + KERNEL1x1_L 16,16, 1,0 + KERNEL1x1_L 16,16, 2,0 + KERNEL1x1_E 16,16, 3,1 bdnz ZGEMM_L1x1_SUB2_LOOP MY_ALIGN ZGEMM_L1x1_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x1_SUB2_2 LOAD1x1 0 - KERNEL1x1_L 16,32, 0,0 - KERNEL1x1_E 16,32, 1,1 + KERNEL1x1_L 16,16, 0,0 + KERNEL1x1_E 16,16, 1,1 MY_ALIGN ZGEMM_L1x1_SUB2_2: andi. 
T1,L, 2 ble ZGEMM_L1x1_SUB2_1 LOAD1x1 0 - KERNEL1x1_E 16,32, 0,1 + KERNEL1x1_E 16,16, 0,1 MY_ALIGN ZGEMM_L1x1_SUB2_1: andi. T1,L, 1 diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 93a309ad1..10d9e4cc3 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -25,68 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xssubdp - #define XSFADD_I1 xsadddp - #define XSFADD_I2 xsadddp - -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xsadddp - #define XSFADD_I1 xssubdp - #define XSFADD_I2 xsadddp - -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xsadddp - #define XSFADD_I1 xsadddp - #define XSFADD_I2 xssubdp - -#else // CC || CR || RC || RR - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xssubdp - #define XSFADD_I1 xssubdp - #define XSFADD_I2 xssubdp - -#endif - -.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V - AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7 -.endm - -.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8 - xxlxor \TEMP1, \TEMP1, \TEMP1 - xxlxor \TEMP2, \TEMP2, \TEMP2 - - xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB - - XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB - XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB - - xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB - xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB - - XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB - XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB - - xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i - xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r - xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r - xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i - - xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i - xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r - xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part -.endm - -/********************************************************************************************** -* Macros for N=2 and M=8 -**********************************************************************************************/ #define unit_size 16 #define DISP32(ind,disp) (ind*unit_size*32+disp) @@ -95,338 +33,457 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
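Note on the hunk that starts here: with the ZGEMM_L1_COPYB*/ZCOPYB_* pre-copy of B into BBUFFER removed and every "mr BO, BBUFFER" turned back into "mr BO, B", the kernels now read the packed {real, imag} pair of B directly and build the swapped pair with xxswapd on the fly. That is why the OffsetB argument of every KERNEL*/END* call above is halved (64 to 32, 32 to 16) and the B displacement macros move from DISP8 to DISP4/DISP2: per column and per k-step, B now advances by one 16-byte complex element instead of a 32-byte real/imag splat pair. A rough scalar C model of one k-step under that scheme follows; the names are made up for illustration and this is not the kernel code itself.

    /*
     * Rough scalar model of one k-step of the reworked ZGEMM micro-kernel.
     * B is read once as {br, bi}; the swapped copy {bi, br} is what xxswapd
     * produces in the assembly.  Illustration only; names are hypothetical.
     */
    typedef struct { double r, i; } zdouble;

    static void zgemm_kstep_model(const zdouble *a, const zdouble *b,
                                  double acc_rr_ii[2], double acc_ri_ir[2])
    {
        /* vs0 = {ar, ai}, vs16 = {br, bi}, vs17 = xxswapd(vs16) = {bi, br} */
        acc_rr_ii[0] += a->r * b->r;   /* xvmaddadp vs32, vs0, vs16 (one lane)   */
        acc_rr_ii[1] += a->i * b->i;   /* xvmaddadp vs32, vs0, vs16 (other lane) */
        acc_ri_ir[0] += a->r * b->i;   /* xvmaddadp vs33, vs0, vs17 (one lane)   */
        acc_ri_ir[1] += a->i * b->r;   /* xvmaddadp vs33, vs0, vs17 (other lane) */
    }

The four partial sums are only combined into a complex result at save time, which is what the new SAVE helpers introduced below are for.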
#define DISP4(ind,disp) (ind*unit_size*4+disp) #define DISP2(ind,disp) (ind*unit_size*2+disp) #define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +/* HELPERS FOR SAVE */ + +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm + +/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm + +/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm + +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm + +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm + +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm + +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm +.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE 
\VSRes9,\VSRes11,vs10,vs11 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs6,vs8,vs16,vs17 + MULT_APLHA_PART2 vs2,vs4,vs14,vs15 + AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + MULT_APLHA_PART1 vs10,vs12, vs24,vs25 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + MULT_APLHA_PART2 vs10,vs12,vs24,vs25 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 + UNPACK_FOR_STORE vs24,vs25,vs10,vs12 + UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART1 vs6,vs8, vs16,vs17 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 +#ifndef TRMMKERNEL + lxv vs18, (\LOFFSET)(\BASE_REG) + xxmrgld vs14,vs18,vs18 + xxmrghd vs15,vs18,vs18 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + xxmrghd vs7,vs15,vs14 + stxv vs7, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ .macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - 
xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 .endm .macro LOAD2x8 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A .if \Zero==1 - Zero2x8 + Zero2x8 .endif .endm .macro END2x8_NORMAL - END2x8 AO,BO,128,64 + END2x8 AO,BO,128,32 .endm -.macro END2x8 AREG, BREG, OffsetA, OffsetB +.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 - 
xvmaddadp vs48, vs0, vs18 // real*real, imag*real - xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs50, vs1, vs18 // real*real, imag*real - xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs52, vs2, vs18 // real*real, imag*real - xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs54, vs3, vs18 // real*real, imag*real - xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - xvmaddadp vs56, vs4, vs18 // real*real, imag*real - xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag - xvmaddadp vs58, vs5, vs18 // real*real, imag*real - xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - xvmaddadp vs60, vs6, vs18 // real*real, imag*real - xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag - xvmaddadp vs62, vs7, vs18 // real*real, imag*real - xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + + + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 .endm -.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + xxswapd vs21, vs20 + xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, 
imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 - xvmaddadp vs48, vs0, vs18 // real*real, imag*real - xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs50, vs1, vs18 // real*real, imag*real - xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs52, vs2, vs18 // real*real, imag*real - xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs54, vs3, vs18 // real*real, imag*real - xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - xvmaddadp vs56, vs4, vs18 // real*real, imag*real - xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag - xvmaddadp vs58, vs5, vs18 // real*real, imag*real - xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - xvmaddadp vs60, vs6, vs18 // real*real, imag*real - xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag - xvmaddadp vs62, vs7, vs18 // real*real, imag*real - xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif + + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 .if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP16(\Index,128+ 
+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP8(\Index,128) + + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + +.if \Complete==0 + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A .endif -.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs12, vs20 // real*real, imag*real - xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag - xvmaddadp vs42, vs13, vs20 // real*real, imag*real - xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - xvmaddadp vs44, vs14, vs20 // real*real, imag*real - xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag - xvmaddadp vs46, vs15, vs20 // real*real, imag*real - xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 - xvmaddadp vs48, vs8, vs22 // real*real, imag*real - xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs50, vs9, vs22 // real*real, imag*real - xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs52, vs10, vs22 // real*real, imag*real - xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs54, vs11, vs22 // real*real, imag*real - xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag - xvmaddadp vs56, vs12, vs22 // real*real, imag*real - xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag - xvmaddadp vs58, vs13, vs22 // real*real, imag*real - xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag - xvmaddadp vs60, vs14, vs22 // real*real, imag*real - xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag - xvmaddadp vs62, vs15, vs22 // real*real, imag*real - xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + +.if \Complete==0 + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 + +.if \Complete==0 + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif + +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 + +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd 
vs19, vs18 +.endif + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 + + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 + + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, vs23 + + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 .endm -.macro KERNEL2x8 +.macro KERNEL2x8 LOAD2x8 0 - END2x8 AO, BO, 128,64 + END2x8 AO, BO, 128,32 .endm .macro SAVE2x8 - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs48,vs49,vs8 - AGGREGATE_INTO_COMPLEX vs50,vs51,vs9 - AGGREGATE_INTO_COMPLEX vs52,vs53,vs10 - AGGREGATE_INTO_COMPLEX vs54,vs55,vs11 - AGGREGATE_INTO_COMPLEX vs56,vs57,vs12 - AGGREGATE_INTO_COMPLEX vs58,vs59,vs13 - AGGREGATE_INTO_COMPLEX vs60,vs61,vs14 - AGGREGATE_INTO_COMPLEX vs62,vs63,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - addi CO, CO, 128 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 .endm @@ -435,223 +492,178 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, 
vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 .endm .macro LOAD2x4 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - .if \Zero==1 - Zero2x4 + Zero2x4 .endif .endm .macro END2x4_NORMAL - END2x4 AO,BO,64,64 + END2x4 AO,BO,64,32 .endm -.macro END2x4 AREG, BREG, OffsetA, OffsetB +.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 .endm -.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - 
lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) +.endif +.endif -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +.if \Complete==0 + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, 
DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP8(\Index,128) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs8, vs22 // real*real, imag*real - xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs42, vs9, vs22 // real*real, imag*real - xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs44, vs10, vs22 // real*real, imag*real - xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs46, vs11, vs22 // real*real, imag*real - xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs41, vs8, vs23 + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs43, vs9, vs23 + + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs45, vs10, vs23 + + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs47, vs11, vs23 .endm -.macro KERNEL2x4 +.macro KERNEL2x4 LOAD2x4 0 - END2x4 AO, BO, 64,64 + END2x4 AO, BO, 64,32 .endm -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs40,vs41,vs8 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs9 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs10 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - addi CO, CO, 64 +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 .endm @@ -660,170 +672,131 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, 
vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 .endm .macro LOAD2x2 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - .if \Zero==1 - Zero2x2 -.endif - + Zero2x2 +.endif .endm .macro END2x2_NORMAL - END2x2 AO,BO,32,64 + END2x2 AO,BO,32,32 .endm -.macro END2x2 AREG, BREG, OffsetA, OffsetB +.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs37, vs0, vs19 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 - xvmaddadp vs36, vs0, vs18 // real*real, imag*real - xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs38, vs1, vs18 // real*real, imag*real - xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag - .endm -.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xxswapd vs23, vs22 -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs37, vs0, vs19 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - - xvmaddadp vs36, vs0, vs18 // real*real, imag*real - 
xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs38, vs1, vs18 // real*real, imag*real - xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 .if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP8(\Index,128) -.endif -.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs37, vs8, vs23 + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs39, vs9, vs23 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - - xvmaddadp vs36, vs8, vs22 // real*real, imag*real - xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs38, vs9, vs22 // real*real, imag*real - xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag - .endm -.macro KERNEL2x2 +.macro KERNEL2x2 LOAD2x2 0 - END2x2 AO, BO, 32,64 + END2x2 AO, BO, 32,32 .endm -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs36,vs37,vs8 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - - addi CO, CO, 32 - +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 .endm /********************************************************************************************** @@ -831,348 +804,288 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 .endm .macro LOAD2x1 Zero - lxv vs0, 0(AO) // load real,imag from A + lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 
0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 .if \Zero==1 - Zero2x1 -.endif - + Zero2x1 +.endif .endm .macro END2x1_NORMAL - END2x1 AO,BO,16,64 + END2x1 AO,BO,16,32 .endm -.macro END2x1 AREG, BREG, OffsetA, OffsetB +.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 - xvmaddadp vs34, vs0, vs18 // real*real, imag*real - xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag - .endm -.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xxswapd vs21, vs20 + xxswapd vs23, vs22 +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif - xvmaddadp vs34, vs0, vs18 // real*real, imag*real - xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 .if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP8(\Index,128) +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if 
\IsLast==1 + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif .endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs8, vs22 // real*real, imag*real - xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag - +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs35, vs8, vs23 + .endm -.macro KERNEL2x1 +.macro KERNEL2x1 LOAD2x1 0 - END2x1 AO, BO, 16,64 + END2x1 AO, BO, 16,32 .endm .macro SAVE2x1 - - mr T1, CO -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - - AGGREGATE_INTO_COMPLEX vs34,vs35,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - - addi CO, CO, 16 - + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 .endm .macro LOAD1x8 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A .if \Zero==1 - Zero1x8 + Zero1x8 .endif .endm .macro END1x8_NORMAL - END1x8 AO,BO,128,32 + END1x8 AO,BO,128,16 .endm -.macro END1x8 AREG, BREG, OffsetA, OffsetB +.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag 
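The rewritten SAVE2x1/SAVE2x2/SAVE2x4/SAVE2x8 bodies above now delegate to the shared SAVE1/SAVE2/SAVE4/SAVE8 helpers: pack the rr/ii and ri/ir partial sums of two adjacent outputs, combine them with the sign choice made in AGGREGATE_REALS_IMAGES, and fold in alpha together with the existing C values using fused multiply-add/subtract. A scalar sketch of the per-element result for the NN/NT/TN/TT case is shown below; the conjugate variants only flip the two signs, and the TRMM build skips the C load. This is an illustration of the arithmetic, not the macro code.

    /*
     * Scalar sketch of what one output element of the new SAVE helpers
     * computes (NN/NT/TN/TT signs).  rr = sum(a_r*b_r), ii = sum(a_i*b_i),
     * ri = sum(a_r*b_i), ir = sum(a_i*b_r) are the kernel's partial sums.
     */
    typedef struct { double r, i; } zdouble;

    static void zgemm_save_model(zdouble *c, double alpha_r, double alpha_i,
                                 double rr, double ii, double ri, double ir)
    {
        double res_r = rr - ii;        /* AGGREGATE_REALS_IMAGES              */
        double res_i = ri + ir;

        /* MULT_APLHA_PART1 + MULT_APLHA_PART2: c += alpha * result */
        c->r += alpha_r * res_r - alpha_i * res_i;
        c->i += alpha_i * res_r + alpha_r * res_i;
    }

Compared with the old AGGREGATE_INTO_COMPLEX path, this operates on two output elements per vector register and interleaves the C loads with the arithmetic instead of reducing each element with scalar xsadddp/xssubdp operations.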
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 .endm -.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21, vs20 - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load 
real,imag from A + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 .if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs12, vs20 // 
real*real, imag*real - xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag - xvmaddadp vs42, vs13, vs20 // real*real, imag*real - xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - xvmaddadp vs44, vs14, vs20 // real*real, imag*real - xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag - xvmaddadp vs46, vs15, vs20 // real*real, imag*real - xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 .endm -.macro KERNEL1x8 +.macro KERNEL1x8 LOAD1x8 0 - END1x8 AO, BO, 128,32 + END1x8 AO, BO, 128,16 .endm .macro SAVE1x8 - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - addi CO, CO, 128 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 .endm @@ -1181,170 +1094,143 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 .endm .macro LOAD1x4 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17,vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - .if \Zero==1 - Zero1x4 + Zero1x4 .endif .endm .macro END1x4_NORMAL - END1x4 AO,BO,64,32 + END1x4 AO,BO,64,16 .endm -.macro END1x4 AREG, BREG, OffsetA, OffsetB +.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - 
xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 .endm -.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21,vs20 -lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 .if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 
32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +.if \Complete==0 + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs8, vs22 // real*real, imag*real - xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs42, vs9, vs22 // real*real, imag*real - xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs44, vs10, vs22 // real*real, imag*real - xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs46, vs11, vs22 // real*real, imag*real - xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs41, vs8, vs23 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs43, vs9, vs23 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs47, vs11, vs23 .endm -.macro KERNEL1x4 +.macro KERNEL1x4 LOAD1x4 0 - END1x4 AO, BO, 64,32 + END1x4 AO, BO, 64,16 .endm .macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - addi CO, CO, 64 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, CO, 64 .endm @@ -1353,122 +1239,99 @@ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 .endm .macro LOAD1x2 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part 
from B - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17,vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A .if \Zero==1 - Zero1x2 + Zero1x2 .endif .endm .macro END1x2_NORMAL - END1x2 AO,BO,32,32 + END1x2 AO,BO,32,16 .endm -.macro END1x2 AREG, BREG, OffsetA, OffsetB +.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + .endm -.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21,vs20 -lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 .if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif .endif -.endif - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // 
real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 .endm -.macro KERNEL1x2 +.macro KERNEL1x2 LOAD1x2 0 - END1x2 AO, BO, 32,32 + END1x2 AO, BO, 32,16 .endm .macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - -addi CO, CO, 32 - + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 .endm /********************************************************************************************** @@ -1476,189 +1339,89 @@ addi CO, CO, 32 **********************************************************************************************/ .macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 .endm .macro LOAD1x1 Zero - lxv vs0, 0(AO) // load real,imag from A - - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs0, 0(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17, vs16 .if \Zero==1 - Zero1x1 + Zero1x1 .endif - + .endm .macro END1x1_NORMAL - END1x1 AO,BO,16,32 + END1x1 AO,BO,16,16 .endm -.macro END1x1 AREG, BREG, OffsetA, OffsetB +.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + +.endm + +.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21, vs20 + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - -.endm - -.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 .if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17, vs16 .endif - -.if \IsLast==1 +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, 
DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) .endif .endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - - + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + + .endm -.macro KERNEL1x1 +.macro KERNEL1x1 LOAD1x1 0 - END1x1 AO, BO, 16,32 - -.endm - -.macro SAVE1x1 - - mr T1, CO -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - -addi CO, CO, 16 + END1x1 AO, BO, 16,16 .endm - -.macro ZCOPYB_2 - - lxv vs32, 0(BO) - lxv vs33, 16(BO) - addi BO, BO, 32 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - xxspltd vs42, vs33, 1 - xxspltd vs43, vs33, 0 - - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - stxv vs42, 32(BBO) - stxv vs43, 48(BBO) - addi BBO, BBO, 64 - -.endm - -.macro ZCOPYB_1 - - lxv vs32, 0(BO) - addi BO, BO, 16 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - - addi BBO, BBO, 32 - -.endm - -.macro ZCOPYB_8 - - lxv vs32, 0(BO) - lxv vs33, 16(BO) - lxv vs34, 32(BO) - lxv vs35, 48(BO) - - lxv vs36, 64+0(BO) - lxv vs37, 64+16(BO) - lxv vs38, 64+32(BO) - lxv vs39, 64+48(BO) - addi BO, BO, 128 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - xxspltd vs42, vs33, 1 - xxspltd vs43, vs33, 0 - xxspltd vs44, vs34, 1 - xxspltd vs45, vs34, 0 - xxspltd vs46, vs35, 1 - xxspltd vs47, vs35, 0 - - xxspltd vs48, vs36, 1 - xxspltd vs49, vs36, 0 - xxspltd vs50, vs37, 1 - xxspltd vs51, vs37, 0 - xxspltd vs52, vs38, 1 - xxspltd vs53, vs38, 0 - xxspltd vs54, vs39, 1 - xxspltd vs55, vs39, 0 - - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - stxv vs42, 32(BBO) - stxv vs43, 48(BBO) - - stxv vs44, 64+0(BBO) - stxv vs45, 64+16(BBO) - stxv vs46, 64+32(BBO) - stxv vs47, 64+48(BBO) - - stxv vs48, 128+ 0(BBO) - stxv vs49, 128+ 16(BBO) - stxv vs50, 128+ 32(BBO) - stxv vs51, 128+ 48(BBO) - - stxv vs52, 192 + 0(BBO) - stxv vs53, 192 + 16(BBO) - stxv vs54, 192+ 32(BBO) - stxv vs55, 192 + 48(BBO) - addi BBO, BBO, 256 - +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 .endm diff --git a/param.h b/param.h index d0b8518c9..8f78a6a64 100644 --- a/param.h +++ b/param.h @@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 640 +#define SGEMM_DEFAULT_P 832 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 640 -#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1408 +#define SGEMM_DEFAULT_Q 1025 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 1152 +#define ZGEMM_DEFAULT_Q 1025 #define SYMV_P 8