diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 0e0d62393..5c10ad64a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy.o CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o -ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index a44659468..f408cdc17 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*alpha is stored in f1. convert to single and splat*/ - xscvdpspn alpha_r,vs1 + xscvdpspn alpha_r,vs1 xxspltw alpha_r,alpha_r,0 diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 300e30470..c149cb903 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -53,9 +53,9 @@ LSGEMM_L8x16_BEGIN: LSGEMM_L8x16_LOOP_START: LOAD8x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=32 - addi AO,AO,2112 - addi BO,BO,32 + /*##OffsetA=64 OffsetB=32 + #addi AO,AO,2112 + #addi BO,BO,32 */ mtctr L @@ -63,29 +63,29 @@ LSGEMM_L8x16_LOOP_START: LSGEMM_L8x16_LOOP: - KERNEL8x16_I1_L4_2 -2048,0, 0,0 - KERNEL8x16_I1_L4_2 -2048,0, 1,0 - KERNEL8x16_I1_L4_2 -2048,0, 2,0 - KERNEL8x16_I1_L4_2 -2048,0, 3,0 - KERNEL8x16_I1_L4_2 -2048,0, 4,0 - KERNEL8x16_I1_L4_2 -2048,0, 5,0 - KERNEL8x16_I1_L4_2 -2048,0, 6,0 - KERNEL8x16_I1_L4_2 -2048,0, 7,0 - KERNEL8x16_I1_L4_2 -2048,0, 8,0 - KERNEL8x16_I1_L4_2 -2048,0, 9,0 - KERNEL8x16_I1_L4_2 -2048,0, 10,0 - KERNEL8x16_I1_L4_2 -2048,0, 11,0 - KERNEL8x16_I1_L4_2 -2048,0, 12,0 - KERNEL8x16_I1_L4_2 -2048,0, 13,0 - KERNEL8x16_I1_L4_2 -2048,0, 14,0 - KERNEL8x16_I1_L4_2 -2048,0, 15,1 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_2 64,32, 15,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: - END8x16 0, AO, BO, -2048, 0 + END8x16 0, AO, BO, 64, 32 b LSGEMM_L8x16_SUB1 MY_ALIGN diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S new file mode 100644 index 000000000..e655f0bfe --- /dev/null +++ b/kernel/power/zgemm_kernel_power9.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 32192 + +#define FZERO 312+192(SP) + + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 + +#define L r15 +#define ALPHA r16 +#define T5 r17 +#define T2 r19 +#define BBO r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + stw r0, FZERO + +#ifdef linux + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li o8 , 8 + li o16 , 16 + + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + + + addi ALPHA, SP, 296+192 + + xxlor alpha_r,vs1,vs1 /*copy from register f1 */ + xxlor alpha_i,vs2,vs2 /*copy from register f2 */ + + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) 
+ lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S new file mode 100644 index 000000000..77ce36294 --- /dev/null +++ b/kernel/power/zgemm_logic_power9.S @@ -0,0 +1,857 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define MY_ALIGN .align 3 + + srawi. J, N, 1 + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + srawi. T1, K, 2 + ble ZGEMM_L2_COPYB1 + +ZGEMM_L2_COPYB8: + + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB8 + +ZGEMM_L2_COPYB1: + + andi. T1, K, 3 + ble ZGEMM_L2_COPYB_END + +ZGEMM_L2_COPYB_LOOP: + + ZCOPYB_2 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB_LOOP + +ZGEMM_L2_COPYB_END: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble ZGEMM_L2x8_END + +ZGEMM_L2x8_BEGIN: + + + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 32x */ + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + + +ZGEMM_L2x8_LOOP_START: + + LOAD2x8 0 + li T2, 1024 + li T3, 1024+512 + li T4, 2048 + li T5, 2048+512 + mtctr L + + MY_ALIGN +ZGEMM_L2x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,64,0,0 + KERNEL2x8_L 128,64,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,64,2,0 + KERNEL2x8_L 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,64,4,0 + KERNEL2x8_L 128,64,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,64,6,0 + KERNEL2x8_L 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,64,8,0 + KERNEL2x8_L 128,64,9,0 + KERNEL2x8_L 128,64,10,0 + KERNEL2x8_L 128,64,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,64,12,0 + KERNEL2x8_L 128,64,13,0 + KERNEL2x8_L 128,64,14,0 + KERNEL2x8_L 128,64,15,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: + END2x8 AO, BO, 128, 64 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB0: + + andi. L, K, 63 + + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x8_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x8_SUB2_LOOP: + LOAD2x8 0 + KERNEL2x8_L 128,64, 0,0 + KERNEL2x8_L 128,64, 1,0 + KERNEL2x8_L 128,64, 2,0 + KERNEL2x8_E 128,64, 3,1 + bdnz ZGEMM_L2x8_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x8_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8 0 + KERNEL2x8_L 128,64, 0,0 + KERNEL2x8_E 128,64, 1,1 + MY_ALIGN +ZGEMM_L2x8_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8 0 + KERNEL2x8_E 128,64, 0,1 + MY_ALIGN +ZGEMM_L2x8_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + +/* addic. L, L, -1 + bgt ZGEMM_L2x8_SUB2_1*/ + +ZGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_END: + +ZGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 4 /**(K-1) % 16x */ + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + +ZGEMM_L2x4_LOOP_START: + LOAD2x4 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x4_LOOP: + KERNEL2x4_L 64,64,0,0 + KERNEL2x4_L 64,64,1,0 + KERNEL2x4_L 64,64,2,0 + KERNEL2x4_L 64,64,3,0 + KERNEL2x4_L 64,64,4,0 + KERNEL2x4_L 64,64,5,0 + KERNEL2x4_L 64,64,6,0 + KERNEL2x4_L 64,64,7,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: + END2x4 AO, BO, 64, 64 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x4_SAVE + +ZGEMM_L2x4_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x4_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x4_SUB2_LOOP: + LOAD2x4 0 + KERNEL2x4_L 64,64, 0,0 + KERNEL2x4_L 64,64, 1,0 + KERNEL2x4_L 64,64, 2,0 + KERNEL2x4_E 64,64, 3,1 + bdnz ZGEMM_L2x4_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x4_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4 0 + KERNEL2x4_L 64,64, 0,0 + KERNEL2x4_E 64,64, 1,1 + MY_ALIGN +ZGEMM_L2x4_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4 0 + KERNEL2x4_E 64,64, 0,1 + MY_ALIGN +ZGEMM_L2x4_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + +ZGEMM_L2x4_SAVE: + + SAVE2x4 + +ZGEMM_L2x4_END: + +ZGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L2x2_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 4 /**(K-1) % 16x */ + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + +ZGEMM_L2x2_LOOP_START: + LOAD2x2 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x2_LOOP: + KERNEL2x2_L 32,64,0,0 + KERNEL2x2_L 32,64,1,0 + KERNEL2x2_L 32,64,2,0 + KERNEL2x2_L 32,64,3,0 + KERNEL2x2_L 32,64,4,0 + KERNEL2x2_L 32,64,5,0 + KERNEL2x2_L 32,64,6,0 + KERNEL2x2_L 32,64,7,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN +ZGEMM_L2x2_LOOP_END: + END2x2 AO, BO, 32, 64 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x2_SAVE + +ZGEMM_L2x2_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x2_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x2_SUB2_LOOP: + LOAD2x2 0 + KERNEL2x2_L 32,64, 0,0 + KERNEL2x2_L 32,64, 1,0 + KERNEL2x2_L 32,64, 2,0 + KERNEL2x2_E 32,64, 3,1 + bdnz ZGEMM_L2x2_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x2_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2 0 + KERNEL2x2_L 32,64, 0,0 + KERNEL2x2_E 32,64, 1,1 + MY_ALIGN +ZGEMM_L2x2_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2 0 + KERNEL2x2_E 32,64, 0,1 + MY_ALIGN +ZGEMM_L2x2_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 +ZGEMM_L2x2_SAVE: + + SAVE2x2 + +ZGEMM_L2x2_END: + +ZGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L2x1_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 4 /**(K-1) % 16x */ + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + +ZGEMM_L2x1_LOOP_START: + + LOAD2x1 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x1_LOOP: + KERNEL2x1_L 16,64,0,0 + KERNEL2x1_L 16,64,1,0 + KERNEL2x1_L 16,64,2,0 + KERNEL2x1_L 16,64,3,0 + KERNEL2x1_L 16,64,4,0 + KERNEL2x1_L 16,64,5,0 + KERNEL2x1_L 16,64,6,0 + KERNEL2x1_L 16,64,7,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: + END2x1 AO, BO, 16, 64 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x1_SAVE + +ZGEMM_L2x1_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x1_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x1_SUB2_LOOP: + LOAD2x1 0 + KERNEL2x1_L 16,64, 0,0 + KERNEL2x1_L 16,64, 1,0 + KERNEL2x1_L 16,64, 2,0 + KERNEL2x1_E 16,64, 3,1 + bdnz ZGEMM_L2x1_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x1_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1 0 + KERNEL2x1_L 16,64, 0,0 + KERNEL2x1_E 16,64, 1,1 + MY_ALIGN +ZGEMM_L2x1_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1 0 + KERNEL2x1_E 16,64, 0,1 + MY_ALIGN +ZGEMM_L2x1_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + +ZGEMM_L2x1_SAVE: + + SAVE2x1 + +ZGEMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt ZGEMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZGEMM_L2_END: + + b ZGEMM_L1_BEGIN + +L999_H1: + + b L999 + +ZGEMM_L1_BEGIN: + andi. T1, N, 1 + ble ZGEMM_L1_END + + mr BO, B + mr BBO, BBUFFER + srawi. T1, K, 3 /*this time K/8 */ + ble ZGEMM_L1_COPYB1 + +ZGEMM_L1_COPYB8: + + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8 + addic. T1, T1, -1 + + bgt ZGEMM_L1_COPYB8 + +ZGEMM_L1_COPYB1: + + andi. T1, K, 7 + ble ZGEMM_L1_COPYB_END + +ZGEMM_L1_COPYB_LOOP: + + ZCOPYB_1 + addic. T1, T1, -1 + + bgt ZGEMM_L1_COPYB_LOOP + +ZGEMM_L1_COPYB_END: + + mr CO, C + mr AO, A + srawi. I, M, 3 + ble ZGEMM_L1x8_END + +ZGEMM_L1x8_BEGIN: + + + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 32x */ + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + + +ZGEMM_L1x8_LOOP_START: + + LOAD1x8 0 + li T2, 1024 + li T3, 1024+512 + li T4, 2048 + li T5, 2048+512 + mtctr L + + MY_ALIGN +ZGEMM_L1x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L 128,32,0,0 + KERNEL1x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL1x8_L 128,32,2,0 + KERNEL1x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L 128,32,4,0 + KERNEL1x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL1x8_L 128,32,6,0 + KERNEL1x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L 128,32,8,0 + KERNEL1x8_L 128,32,9,0 + KERNEL1x8_L 128,32,10,0 + KERNEL1x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL1x8_L 128,32,12,0 + KERNEL1x8_L 128,32,13,0 + KERNEL1x8_L 128,32,14,0 + KERNEL1x8_L 128,32,15,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: + END1x8 AO, BO, 128, 32 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x8_SAVE + +ZGEMM_L1x8_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x8_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x8_SUB2_LOOP: + LOAD1x8 0 + KERNEL1x8_L 128,32, 0,0 + KERNEL1x8_L 128,32, 1,0 + KERNEL1x8_L 128,32, 2,0 + KERNEL1x8_E 128,32, 3,1 + bdnz ZGEMM_L1x8_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x8_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8 0 + KERNEL1x8_L 128,32, 0,0 + KERNEL1x8_E 128,32, 1,1 + MY_ALIGN +ZGEMM_L1x8_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8 0 + KERNEL1x8_E 128,32, 0,1 + MY_ALIGN +ZGEMM_L1x8_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + +/* addic. L, L, -1 + bgt ZGEMM_L1x8_SUB2_1*/ + +ZGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt ZGEMM_L1x8_BEGIN + +ZGEMM_L1x8_END: + +ZGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L1x1_END + + andi. T1, M, 4 + ble ZGEMM_L1x4_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 5 /**(K-1) % 16x */ + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + +ZGEMM_L1x4_LOOP_START: + LOAD1x4 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x4_LOOP: + KERNEL1x4_L 64,32,0,0 + KERNEL1x4_L 64,32,1,0 + KERNEL1x4_L 64,32,2,0 + KERNEL1x4_L 64,32,3,0 + KERNEL1x4_L 64,32,4,0 + KERNEL1x4_L 64,32,5,0 + KERNEL1x4_L 64,32,6,0 + KERNEL1x4_L 64,32,7,0 + KERNEL1x4_L 64,32,8,0 + KERNEL1x4_L 64,32,9,0 + KERNEL1x4_L 64,32,10,0 + KERNEL1x4_L 64,32,11,0 + KERNEL1x4_L 64,32,12,0 + KERNEL1x4_L 64,32,13,0 + KERNEL1x4_L 64,32,14,0 + KERNEL1x4_L 64,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN +ZGEMM_L1x4_LOOP_END: + END1x4 AO, BO, 64, 32 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + +ZGEMM_L1x4_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x4_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x4_SUB2_LOOP: + LOAD1x4 0 + KERNEL1x4_L 64,32, 0,0 + KERNEL1x4_L 64,32, 1,0 + KERNEL1x4_L 64,32, 2,0 + KERNEL1x4_E 64,32, 3,1 + bdnz ZGEMM_L1x4_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x4_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4 0 + KERNEL1x4_L 64,32, 0,0 + KERNEL1x4_E 64,32, 1,1 + MY_ALIGN +ZGEMM_L1x4_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4 0 + KERNEL1x4_E 64,32, 0,1 + MY_ALIGN +ZGEMM_L1x4_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + +ZGEMM_L1x4_SAVE: + + SAVE1x4 + +ZGEMM_L1x4_END: + +ZGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L1x2_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 16x */ + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + +ZGEMM_L1x2_LOOP_START: + LOAD1x2 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x2_LOOP: + KERNEL1x2_L 32,32,0,0 + KERNEL1x2_L 32,32,1,0 + KERNEL1x2_L 32,32,2,0 + KERNEL1x2_L 32,32,3,0 + KERNEL1x2_L 32,32,4,0 + KERNEL1x2_L 32,32,5,0 + KERNEL1x2_L 32,32,6,0 + KERNEL1x2_L 32,32,7,0 + KERNEL1x2_L 32,32,8,0 + KERNEL1x2_L 32,32,9,0 + KERNEL1x2_L 32,32,10,0 + KERNEL1x2_L 32,32,11,0 + KERNEL1x2_L 32,32,12,0 + KERNEL1x2_L 32,32,13,0 + KERNEL1x2_L 32,32,14,0 + KERNEL1x2_L 32,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN +ZGEMM_L1x2_LOOP_END: + END1x2 AO, BO, 32, 32 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + +ZGEMM_L1x2_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x2_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x2_SUB2_LOOP: + LOAD1x2 0 + KERNEL1x2_L 32,32, 0,0 + KERNEL1x2_L 32,32, 1,0 + KERNEL1x2_L 32,32, 2,0 + KERNEL1x2_E 32,32, 3,1 + bdnz ZGEMM_L1x2_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x2_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2 0 + KERNEL1x2_L 32,32, 0,0 + KERNEL1x2_E 32,32, 1,1 + MY_ALIGN +ZGEMM_L1x2_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2 0 + KERNEL1x2_E 32,32, 0,1 + MY_ALIGN +ZGEMM_L1x2_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 +ZGEMM_L1x2_SAVE: + + SAVE1x2 + +ZGEMM_L1x2_END: + +ZGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L1x1_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 5 /**(K-1) % 16x */ + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + +ZGEMM_L1x1_LOOP_START: + + LOAD1x1 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x1_LOOP: + KERNEL1x1_L 16,32,0,0 + KERNEL1x1_L 16,32,1,0 + KERNEL1x1_L 16,32,2,0 + KERNEL1x1_L 16,32,3,0 + KERNEL1x1_L 16,32,4,0 + KERNEL1x1_L 16,32,5,0 + KERNEL1x1_L 16,32,6,0 + KERNEL1x1_L 16,32,7,0 + KERNEL1x1_L 16,32,8,0 + KERNEL1x1_L 16,32,9,0 + KERNEL1x1_L 16,32,10,0 + KERNEL1x1_L 16,32,11,0 + KERNEL1x1_L 16,32,12,0 + KERNEL1x1_L 16,32,13,0 + KERNEL1x1_L 16,32,14,0 + KERNEL1x1_L 16,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN +ZGEMM_L1x1_LOOP_END: + END1x1 AO, BO, 16, 32 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + +ZGEMM_L1x1_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x1_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x1_SUB2_LOOP: + LOAD1x1 0 + KERNEL1x1_L 16,32, 0,0 + KERNEL1x1_L 16,32, 1,0 + KERNEL1x1_L 16,32, 2,0 + KERNEL1x1_E 16,32, 3,1 + bdnz ZGEMM_L1x1_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x1_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1 0 + KERNEL1x1_L 16,32, 0,0 + KERNEL1x1_E 16,32, 1,1 + MY_ALIGN +ZGEMM_L1x1_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1 0 + KERNEL1x1_E 16,32, 0,1 + MY_ALIGN +ZGEMM_L1x1_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + +ZGEMM_L1x1_SAVE: + + SAVE1x1 + +ZGEMM_L1x1_END: + +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S new file mode 100644 index 000000000..93a309ad1 --- /dev/null +++ b/kernel/power/zgemm_macros_power9.S @@ -0,0 +1,1664 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V + AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7 +.endm + +.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8 + xxlxor \TEMP1, \TEMP1, \TEMP1 + xxlxor \TEMP2, \TEMP2, \TEMP2 + + xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB + XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB + + xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB + xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB + + XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB + XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB + + xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i + xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r + xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r + xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i + + xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i + xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r + xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) 
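+/* Editorial note: DISPn(ind, disp) is the byte offset of the ind-th unrolled
+   step for a tile that consumes n complex doubles (unit_size = 16 bytes) per
+   step; the KERNEL*_L / KERNEL*_E macros feed these offsets to the lxv loads
+   and to the final addi that advances AO/BO. */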
+#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD2x8 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A + +.if \Zero==1 + Zero2x8 +.endif + +.endm + +.macro END2x8_NORMAL + END2x8 AO,BO,128,64 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, 
\OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B 
+ lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x8 + LOAD2x8 0 + END2x8 AO, BO, 128,64 +.endm + +.macro SAVE2x8 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs48,vs49,vs8 + AGGREGATE_INTO_COMPLEX vs50,vs51,vs9 + AGGREGATE_INTO_COMPLEX vs52,vs53,vs10 + AGGREGATE_INTO_COMPLEX vs54,vs55,vs11 + AGGREGATE_INTO_COMPLEX vs56,vs57,vs12 + 
AGGREGATE_INTO_COMPLEX vs58,vs59,vs13 + AGGREGATE_INTO_COMPLEX vs60,vs61,vs14 + AGGREGATE_INTO_COMPLEX vs62,vs63,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + addi CO, CO, 128 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD2x4 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + +.if \Zero==1 + Zero2x4 +.endif + +.endm + +.macro END2x4_NORMAL + END2x4 AO,BO,64,64 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag 
part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x4 + LOAD2x4 0 + END2x4 AO, BO, 64,64 +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs40,vs41,vs8 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs9 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs10 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + 
stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + addi CO, CO, 64 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + +.macro LOAD2x2 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + +.if \Zero==1 + Zero2x2 +.endif + +.endm + +.macro END2x2_NORMAL + END2x2 AO,BO,32,64 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, 
vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x2 + LOAD2x2 0 + END2x2 AO, BO, 32,64 +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs36,vs37,vs8 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + + addi CO, CO, 32 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 +.endm + +.macro LOAD2x1 Zero + lxv vs0, 0(AO) // load real,imag from A + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + +.if \Zero==1 + Zero2x1 +.endif + +.endm + +.macro END2x1_NORMAL + END2x1 AO,BO,16,64 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, 
imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x1 + LOAD2x1 0 + END2x1 AO, BO, 16,64 +.endm + +.macro SAVE2x1 + + mr T1, CO +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + + AGGREGATE_INTO_COMPLEX vs34,vs35,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + + addi CO, CO, 16 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD1x8 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A + +.if \Zero==1 + Zero1x8 +.endif + +.endm + +.macro END1x8_NORMAL + END1x8 AO,BO,128,32 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, 
DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + +.endm + +.macro KERNEL1x8 + LOAD1x8 0 + END1x8 AO, BO, 128,32 +.endm + +.macro SAVE1x8 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + AGGREGATE_INTO_COMPLEX 
vs40,vs41,vs12 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + addi CO, CO, 128 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + +.macro LOAD1x4 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + +.if \Zero==1 + Zero1x4 +.endif + +.endm + +.macro END1x4_NORMAL + END1x4 AO,BO,64,32 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, 
imag*imag + +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL1x4 + LOAD1x4 0 + END1x4 AO, BO, 64,32 +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + addi CO, CO, 64 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 +.endm + +.macro LOAD1x2 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + +.if \Zero==1 + Zero1x2 +.endif + +.endm + +.macro END1x2_NORMAL + END1x2 AO,BO,32,32 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // 
load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + +.endm + +.macro KERNEL1x2 + LOAD1x2 0 + END1x2 AO, BO, 32,32 +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + +addi CO, CO, 32 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + +.macro LOAD1x1 Zero + lxv vs0, 0(AO) // load real,imag from A + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + +.if \Zero==1 + Zero1x1 +.endif + +.endm + +.macro END1x1_NORMAL + END1x1 AO,BO,16,32 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, 
vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1 + LOAD1x1 0 + END1x1 AO, BO, 16,32 + +.endm + +.macro SAVE1x1 + + mr T1, CO +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + +addi CO, CO, 16 + +.endm + + +.macro ZCOPYB_2 + + lxv vs32, 0(BO) + lxv vs33, 16(BO) + addi BO, BO, 32 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + xxspltd vs42, vs33, 1 + xxspltd vs43, vs33, 0 + + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + stxv vs42, 32(BBO) + stxv vs43, 48(BBO) + addi BBO, BBO, 64 + +.endm + +.macro ZCOPYB_1 + + lxv vs32, 0(BO) + addi BO, BO, 16 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + + addi BBO, BBO, 32 + +.endm + +.macro ZCOPYB_8 + + lxv vs32, 0(BO) + lxv vs33, 16(BO) + lxv vs34, 32(BO) + lxv vs35, 48(BO) + + lxv vs36, 64+0(BO) + lxv vs37, 64+16(BO) + lxv vs38, 64+32(BO) + lxv vs39, 64+48(BO) + addi BO, BO, 128 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + xxspltd vs42, vs33, 1 + xxspltd vs43, vs33, 0 + xxspltd vs44, vs34, 1 + xxspltd vs45, vs34, 0 + xxspltd vs46, vs35, 1 + xxspltd vs47, vs35, 0 + + xxspltd vs48, vs36, 1 + xxspltd vs49, vs36, 0 + xxspltd vs50, vs37, 1 + xxspltd vs51, vs37, 0 + xxspltd vs52, vs38, 1 + xxspltd vs53, vs38, 0 + xxspltd vs54, vs39, 1 + xxspltd vs55, vs39, 0 + + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + stxv vs42, 32(BBO) + stxv vs43, 48(BBO) + + stxv vs44, 64+0(BBO) + stxv vs45, 64+16(BBO) + stxv vs46, 64+32(BBO) + stxv vs47, 64+48(BBO) + + stxv vs48, 128+ 0(BBO) + stxv vs49, 128+ 16(BBO) + stxv vs50, 128+ 32(BBO) + stxv vs51, 128+ 48(BBO) + + stxv vs52, 192 + 0(BBO) + stxv vs53, 192 + 16(BBO) + stxv vs54, 192+ 32(BBO) + stxv vs55, 192 + 48(BBO) + addi BBO, BBO, 256 + +.endm + diff --git a/param.h b/param.h index 4dcd96a75..d0b8518c9 100644 --- a/param.h +++ b/param.h @@ -2251,12 +2251,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 640 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 640 -#define ZGEMM_DEFAULT_P 320 +#define ZGEMM_DEFAULT_P 512 #define SGEMM_DEFAULT_Q 1408 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 640 +#define ZGEMM_DEFAULT_Q 1152 #define SYMV_P 8
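
Note on the accumulation scheme used by the macros above: every element of C is carried in two VSX accumulators, one holding (real*real, imag*real) and one holding (real*imag, imag*imag), and AGGREGATE_INTO_COMPLEX folds that pair into an interleaved complex value before the optional add of the previous C tile. The scalar C model below is for orientation only, not part of the patch: zgemm_1x1_model and zdouble are hypothetical names, the combine assumes the plain non-conjugated (NN) case, and alpha is applied at the end because the SAVE macros store alpha-scaled results into C.

/* Scalar sketch of the vs32/vs33 accumulation and the AGGREGATE_INTO_COMPLEX
 * style combine for one 1x1 tile (illustrative only). */
typedef struct { double re, im; } zdouble;

void zgemm_1x1_model(const zdouble *a, const zdouble *b, int k,
                     zdouble alpha, zdouble *c)
{
    double acc_rr = 0.0, acc_ir = 0.0;   /* lanes of vs32: real*real, imag*real */
    double acc_ri = 0.0, acc_ii = 0.0;   /* lanes of vs33: real*imag, imag*imag */

    for (int i = 0; i < k; i++) {
        double br = b[i].re, bi = b[i].im;   /* B lanes already split by ZCOPYB_* */
        acc_rr += a[i].re * br;
        acc_ir += a[i].im * br;
        acc_ri += a[i].re * bi;
        acc_ii += a[i].im * bi;
    }

    /* Fold the two accumulator lanes into one complex value, then scale by
     * alpha and add into C (the non-TRMM path of SAVE1x1). */
    double cr = acc_rr - acc_ii;
    double ci = acc_ir + acc_ri;
    c->re += alpha.re * cr - alpha.im * ci;
    c->im += alpha.re * ci + alpha.im * cr;
}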
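
The ZCOPYB_* macros also explain the B strides used throughout: each packed complex element of B is expanded into a real-splat vector followed by an imag-splat vector, so one 16-byte element becomes 32 bytes in BBUFFER (hence B advances 32 bytes per k iteration for N=1 and 64 for N=2). A rough C model follows; zcopyb_model is a hypothetical name, and the in-pair ordering simply mirrors the kernels' "load real part"/"load imag part" comments, with the xxspltd doubleword indices handling the little-endian layout in the actual code.

#include <stddef.h>

typedef struct { double re, im; } zdouble;

/* Sketch of ZCOPYB_1 applied n times: duplicate each complex element of the
 * packed B panel into a (re,re) pair and an (im,im) pair in BBUFFER. */
void zcopyb_model(const zdouble *b, double *bbuffer, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        bbuffer[4*i + 0] = b[i].re;   /* real lane, slot 0 */
        bbuffer[4*i + 1] = b[i].re;   /* real lane, slot 1 */
        bbuffer[4*i + 2] = b[i].im;   /* imag lane, slot 0 */
        bbuffer[4*i + 3] = b[i].im;   /* imag lane, slot 1 */
    }
}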