diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 00d31f8b6..4fc7190b0 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -10,7 +10,7 @@ else STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power9.S +ZTRMMKERNEL = zgemm_kernel_power10.S SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -42,7 +42,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_power9.S +ZGEMMKERNEL = zgemm_kernel_power10.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S new file mode 100644 index 000000000..fca389e69 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs62 +#define alpha_i vs63 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs20, 288(SP) + stxv vs21, 304(SP) + stxv vs22, 320(SP) + stxv vs23, 336(SP) + stxv vs24, 352(SP) + stxv vs25, 368(SP) + stxv vs26, 384(SP) + stxv vs27, 400(SP) + stxv vs28, 416(SP) + stxv vs29, 432(SP) + stxv vs30, 448(SP) + stxv vs31, 464(SP) + + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power10.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power10.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs20, 288(SP) + lxv vs21, 304(SP) + lxv vs22, 320(SP) + lxv vs23, 336(SP) + lxv vs24, 352(SP) + lxv vs25, 368(SP) + lxv vs26, 384(SP) + lxv vs27, 400(SP) + mtlr r0 + lxv vs28, 416(SP) + lxv vs29, 432(SP) + lxv vs30, 448(SP) + lxv vs31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + 
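+/* Overview, a reference sketch rather than the implementation: this kernel
+   computes C += alpha*A*B on double-complex data, with A and B already
+   packed by the copy routines and the work tiled into the 8x2/4x2/2x2 MMA
+   paths (xxsetaccz / xvf64gerpp / xxmfacc) plus FMA-based 2x1/1x1 tails.
+   The xvnegdp of alpha_r/alpha_i above folds the sign flips of the
+   CC/CR/RC/RR conjugation variants into alpha once, instead of per element.
+   A hypothetical C equivalent of the math, ignoring packing and tiling
+   (all names here are illustrative):
+
+       #include <complex.h>
+       void zgemm_ref(long m, long n, long k, double complex alpha,
+                      const double complex *A,   // packed m x k panel
+                      const double complex *B,   // packed k x n panel
+                      double complex *C, long ldc) {
+           for (long j = 0; j < n; j++)
+               for (long i = 0; i < m; i++) {
+                   double complex acc = 0;
+                   for (long l = 0; l < k; l++)
+                       acc += A[l*m + i] * B[l*n + j];
+                   C[j*ldc + i] += alpha * acc;   // beta == 1 path
+               }
+       }
+*/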
EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_power10.S b/kernel/power/zgemm_logic_power10.S new file mode 100644 index 000000000..1143733e0 --- /dev/null +++ b/kernel/power/zgemm_logic_power10.S @@ -0,0 +1,1735 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 0 + KERNEL2x8_2 32, 0 + KERNEL2x8_2 33, 0 + KERNEL2x8_2 34, 0 + KERNEL2x8_2 35, 0 + KERNEL2x8_2 36, 0 + KERNEL2x8_2 37, 0 + KERNEL2x8_2 38, 0 + KERNEL2x8_2 39, 0 + KERNEL2x8_2 40, 0 + KERNEL2x8_2 41, 0 + KERNEL2x8_2 42, 0 + KERNEL2x8_2 43, 0 + KERNEL2x8_2 44, 0 + KERNEL2x8_2 45, 0 + KERNEL2x8_2 46, 0 + KERNEL2x8_2 47, 0 + KERNEL2x8_2 48, 0 + KERNEL2x8_2 49, 0 + KERNEL2x8_2 50, 0 + KERNEL2x8_2 51, 0 + KERNEL2x8_2 52, 0 + KERNEL2x8_2 53, 0 + KERNEL2x8_2 54, 0 + KERNEL2x8_2 55, 0 + KERNEL2x8_2 56, 0 + KERNEL2x8_2 57, 0 + KERNEL2x8_2 58, 0 + KERNEL2x8_2 59, 0 + KERNEL2x8_2 60, 0 + KERNEL2x8_2 61, 0 + KERNEL2x8_2 62, 0 + KERNEL2x8_2 63, 1 + bdz ZGEMM_L2x8_LOOP_END + b ZGEMM_L2x8_LOOP + MY_ALIGN + +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + KERNEL2x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_2 0, 0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 0 + KERNEL2x4_2 8, 0 + KERNEL2x4_2 9, 0 + KERNEL2x4_2 10, 0 + KERNEL2x4_2 11, 0 + KERNEL2x4_2 12, 0 + KERNEL2x4_2 13, 0 + KERNEL2x4_2 14, 0 + KERNEL2x4_2 15, 1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + KERNEL2x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_2 0, 0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 0 + KERNEL2x2_2 8, 0 + KERNEL2x2_2 9, 0 + KERNEL2x2_2 10, 0 + KERNEL2x2_2 11, 0 + KERNEL2x2_2 12, 0 + KERNEL2x2_2 13, 0 + KERNEL2x2_2 14, 0 + KERNEL2x2_2 15, 1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + KERNEL2x2_2 0, 1 + blr + MY_ALIGN + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 0, 0 +ZGEMM_L2x1_K32: 
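+/* Trip-count bookkeeping used throughout this file, sketched with
+   illustrative C names: the two k iterations consumed by the LOAD2x1_2
+   preload are subtracted first, each KERNEL2x1_L2 below covers two more,
+   so one pass of this body covers 32 k steps.  The callers set up:
+
+       long passes = (K - 2) >> 5;   // srawi. T8, T1, 5 ; mtctr T8
+       long tail   = (K - 2) & 31;   // andi. L, T1, 31 -> SUB2 paths
+
+   The 8-wide loops use the same scheme with 128 (shift 7, mask 127). */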
+/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_L2 32, 64, 7, 0 + KERNEL2x1_L2 32, 64, 8, 0 + KERNEL2x1_L2 32, 64, 9, 0 + KERNEL2x1_L2 32, 64, 10, 0 + KERNEL2x1_L2 32, 64, 11, 0 + KERNEL2x1_L2 32, 64, 12, 0 + KERNEL2x1_L2 32, 64, 13, 0 + KERNEL2x1_L2 32, 64, 14, 0 + KERNEL2x1_L2 32, 64, 15, 1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + bgt ZGEMM_L2_BEGIN + b ZGEMM_L2_END + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC, 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + bgt ZGEMM_L2_BEGIN_CONTINUE + b ZGEMM_L2x8_END + +ZGEMM_L2_BEGIN_CONTINUE: + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 2 + mr T1, T6 +#else + mr T1, K +#endif +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 <- T1 % 128 */ + + KERNEL2x8_PRELOAD + KERNEL2x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + + bgt ZGEMM_L2x8_BEGIN_CONTINUE + b ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_BEGIN_CONTINUE: + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP2x8_128K + LOAD_END_2x8 128, 32 + KERNEL2x8_PRELOAD + addi BO, BO, -64 + addi AO,AO, -256 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + +CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -256 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + LOAD_END_2x8 128, 32 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL2x8_UNPRIME_MMA + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 2 +#endif + + ble ZGEMM_L2x8_SAVE_CONTINUE + b ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_SAVE_CONTINUE: + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL2x4_PRELOAD + KERNEL2x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x4_32K + LOAD_END_2x4 64, 32 + KERNEL2x4_PRELOAD + addi BO, BO, -64 + addi AO,AO, -128 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -128 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + LOAD_END_2x4 64, 32 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + KERNEL2x4_UNPRIME_MMA + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL2x2_PRELOAD + KERNEL2x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x2_32K + LOAD_END_2x2 32, 32 + KERNEL2x2_PRELOAD + addi BO, BO, -64 + addi AO,AO, -64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + LOAD_END_2x2 32, 32 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + KERNEL2x2_UNPRIME_MMA + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. 
T8, T1, 5 /* T8 = (K-2) / 32 */
+#endif
+ ZERO2x1
+ ble ZGEMM_L2x1_SUB0
+ bl ZGEMM_2x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x1_SAVE
+ b ZGEMM_L2x1_SUB2
+
+
+ZGEMM_L2x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6, 33
+#else
+ andi. L, K, 63
+ cmpwi K, 33
+#endif
+ li T8, 1
+ bne CMP2x1_32K
+ addi BO, BO, -32
+ addi AO,AO, -16
+ LOAD2x1O 16, 32
+ END2x1_WITHOUT_ADD
+ LOAD2x1_2O 32, 64
+ mtctr T8
+ bl ZGEMM_L2x1_K32
+ b ZGEMM_L2x1_SAVE
+ CMP2x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6, 32
+#else
+ cmpwi K, 32
+#endif
+ bne ZGEMM_L2x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO, BO, -64
+ addi AO,AO, -32
+ LOAD2x1_2O 32, 64
+ bl ZGEMM_L2x1_K32
+ b ZGEMM_L2x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x1_SUB2_8
+ LOAD2x1_2
+ KERNEL2x1_L2 32, 64, 0, 0
+ KERNEL2x1_L2 32, 64, 1, 0
+ KERNEL2x1_L2 32, 64, 2, 0
+ KERNEL2x1_L2 32, 64, 3, 0
+ KERNEL2x1_L2 32, 64, 4, 0
+ KERNEL2x1_L2 32, 64, 5, 0
+ KERNEL2x1_L2 32, 64, 6, 0
+ KERNEL2x1_E2 32, 64, 7, 1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x1_SUB2_4
+ LOAD2x1_2
+ KERNEL2x1_L2 32, 64, 0, 0
+ KERNEL2x1_L2 32, 64, 1, 0
+ KERNEL2x1_L2 32, 64, 2, 0
+ KERNEL2x1_E2 32, 64, 3, 1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x1_SUB2_2
+ LOAD2x1_2
+ KERNEL2x1_L2 32, 64, 0, 0
+ KERNEL2x1_E2 32, 64, 1, 1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x1_SUB2_1
+ LOAD2x1_2
+ KERNEL2x1_E2 32, 64, 0, 1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x1_SAVE
+ KERNEL2x1
+
+
+ZGEMM_L2x1_SAVE:
+/*----------------------------------------*/
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 2
+#endif
+
+
+ZGEMM_L2x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 5
+ addic. 
J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + ble ZGEMM_L2_END + b ZGEMM_L2_BEGIN + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 0 + KERNEL1x8_2 32, 0 + KERNEL1x8_2 33, 0 + KERNEL1x8_2 34, 0 + KERNEL1x8_2 35, 0 + KERNEL1x8_2 36, 0 + KERNEL1x8_2 37, 0 + KERNEL1x8_2 38, 0 + KERNEL1x8_2 39, 0 + KERNEL1x8_2 40, 0 + KERNEL1x8_2 41, 0 + KERNEL1x8_2 42, 0 + KERNEL1x8_2 43, 0 + KERNEL1x8_2 44, 0 + KERNEL1x8_2 45, 0 + KERNEL1x8_2 46, 0 + KERNEL1x8_2 47, 0 + KERNEL1x8_2 48, 0 + KERNEL1x8_2 49, 0 + KERNEL1x8_2 50, 0 + KERNEL1x8_2 51, 0 + KERNEL1x8_2 52, 0 + KERNEL1x8_2 53, 0 + KERNEL1x8_2 54, 0 + KERNEL1x8_2 55, 0 + KERNEL1x8_2 56, 0 + KERNEL1x8_2 57, 0 + KERNEL1x8_2 58, 0 + KERNEL1x8_2 59, 0 + KERNEL1x8_2 60, 0 + KERNEL1x8_2 61, 0 + KERNEL1x8_2 62, 0 + KERNEL1x8_2 63, 1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + KERNEL1x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_2 0, 0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 0 + KERNEL1x4_2 8, 0 + KERNEL1x4_2 9, 0 + KERNEL1x4_2 10, 0 + KERNEL1x4_2 11, 0 + KERNEL1x4_2 12, 0 + KERNEL1x4_2 13, 0 + KERNEL1x4_2 14, 0 + KERNEL1x4_2 15, 1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + KERNEL1x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_2 0, 0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 0 + KERNEL1x2_2 8, 0 + KERNEL1x2_2 9, 0 + KERNEL1x2_2 10, 0 + KERNEL1x2_2 11, 0 + KERNEL1x2_2 12, 0 + KERNEL1x2_2 13, 0 + KERNEL1x2_2 14, 0 + KERNEL1x2_2 15, 1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + KERNEL1x2_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 0, 0 + + 
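+/* The 1x1 and 2x1 paths do not use the MMA accumulators; they accumulate
+   the four partial products of each complex multiply with xvmaddadp
+   against B and an xxswapd copy of B (see the 2x1 macros in
+   zgemm_macros_power10.S).  Per k step, roughly (a sketch):
+
+       // a = {ar, ai}, b = {br, bi}, bswap = {bi, br}
+       acc_rr_ii += a * b;      // {ar*br, ai*bi}
+       acc_ri_ir += a * bswap;  // {ar*bi, ai*br}
+
+   The SAVE path then forms ar*br - ai*bi and ar*bi + ai*br, with signs
+   chosen by the NN/CN/NC/CC conjugation variant, before scaling by
+   alpha. */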
+ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_L2 32, 32, 7, 0 + KERNEL1x1_L2 32, 32, 8, 0 + KERNEL1x1_L2 32, 32, 9, 0 + KERNEL1x1_L2 32, 32, 10, 0 + KERNEL1x1_L2 32, 32, 11, 0 + KERNEL1x1_L2 32, 32, 12, 0 + KERNEL1x1_L2 32, 32, 13, 0 + KERNEL1x1_L2 32, 32, 14, 0 + KERNEL1x1_L2 32, 32, 15, 1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + KERNEL1x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP1x8_128K + LOAD_END_1x8 -128, -16 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -256 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + KERNEL1x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + LOAD_END_1x8 128, 16 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL1x8_UNPRIME_MMA + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL1x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x4_32K + LOAD_END_1x4 -64, -16 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -128 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. 
T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + KERNEL1x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + LOAD_END_1x4 64,16 + + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + KERNEL1x4_UNPRIME_MMA + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL1x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x2_32K + LOAD_END_1x2 -32, -16 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -64 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + KERNEL1x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + LOAD_END_1x2 32,16 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + KERNEL1x2_UNPRIME_MMA + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. 
L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x1_32K + addi BO, BO, -16 + addi AO,AO, -16 + LOAD1x1O 16, 16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -32 + LOAD1x1_2O 32, 32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1, L, 16 + ble ZGEMM_L1x1_SUB2_8 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_E2 32, 32, 7, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1, L, 8 + ble ZGEMM_L1x1_SUB2_4 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_E2 32, 32, 3, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_E2 32, 32, 1, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32, 32, 0, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S new file mode 100644 index 000000000..42f9c5ad4 --- /dev/null +++ b/kernel/power/zgemm_macros_power10.S @@ -0,0 +1,1138 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 16
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+/* HELPERS FOR SAVE */
+/* merge {r0,i0} and {r1,i1} into {r0,r1} and {i0,i1} */
+
+
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+/* from two results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
+
+
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results */
+.endm
+/* from two results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
+
+
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
+.endm
+/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
+
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* we assume {-alpha_r,-alpha_i} for this case */
+ /* this computes i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* we negate alpha_i instead to fix the sign */
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ; VSOUT2 = VSOUT2 + {r0,r1} * {alpha_i,alpha_i} */
+
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
+/* unpack 2x{r,r} and {i,i} into {r,i} and {r,i} for storing (big endian because of stxv) */
+
+
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+
+
+.macro 
STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs38,vs40,vs48,vs49 + MULT_APLHA_PART2 vs34,vs36,vs46,vs47 + AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + MULT_APLHA_PART1 vs42,vs44, vs56,vs57 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + MULT_APLHA_PART2 vs42,vs44,vs56,vs57 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59 + UNPACK_FOR_STORE vs56,vs57,vs42,vs44 + UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs42,vs44 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART1 vs38,vs40, vs48,vs49 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 +#ifndef TRMMKERNEL + lxv vs50, (\LOFFSET)(\BASE_REG) + xxmrgld vs46,vs50,vs50 + xxmrghd vs47,vs50,vs50 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 + 
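+ /* A worked sketch of the remaining steps (NN case): with rr = ar*br,
+    ii = ai*bi, ri = ar*bi, ir = ai*br now split into vs34..vs37,
+    AGGREGATE forms x = rr - ii and y = ri + ir, and the two MULT_APLHA
+    parts then produce
+        real = alpha_r*x - alpha_i*y + C_r
+        imag = alpha_r*y + alpha_i*x + C_i
+    via msub/madd, so the C values loaded above fold in for free
+    (for TRMM the C load is compiled out and plain multiplies are used). */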
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + xxmrghd vs39,vs47,vs46 + stxv vs39, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro KERNEL2x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + + +.macro KERNEL2x8_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x8_2 Index, IsLast + lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs44, vs51 + xvf64gerpp 3, vs46, vs51 + xvf64gerpp 4, vs40, vs50 + xvf64gerpp 5, vs42, vs50 + xvf64gerpp 6, vs44, vs50 + xvf64gerpp 7, vs46, vs50 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x8 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x8_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 + xxmfacc 4 + xxmfacc 5 + xxmfacc 6 + xxmfacc 7 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + xxpermdi 
vs32, vs16, vs17, 0b01 + xxpermdi vs33, vs16, vs17, 0b10 + xxpermdi vs34, vs18, vs19, 0b01 + xxpermdi vs35, vs18, vs19, 0b10 + xxpermdi vs36, vs20, vs21, 0b01 + xxpermdi vs37, vs20, vs21, 0b10 + xxpermdi vs38, vs22, vs23, 0b01 + xxpermdi vs39, vs22, vs23, 0b10 + xxpermdi vs40, vs24, vs25, 0b01 + xxpermdi vs41, vs24, vs25, 0b10 + xxpermdi vs42, vs26, vs27, 0b01 + xxpermdi vs43, vs26, vs27, 0b10 + xxpermdi vs44, vs28, vs29, 0b01 + xxpermdi vs45, vs28, vs29, 0b10 + xxpermdi vs46, vs30, vs31, 0b01 + xxpermdi vs47, vs30, vs31, 0b10 + + xxlor vs18, vs32, vs32 + xxlor vs19, vs33, vs33 + xxlor vs16, vs34, vs34 + xxlor vs17, vs35, vs35 + xxlor vs22, vs36, vs36 + xxlor vs23, vs37, vs37 + xxlor vs20, vs38, vs38 + xxlor vs21, vs39, vs39 + xxlor vs26, vs40, vs40 + xxlor vs27, vs41, vs41 + xxlor vs24, vs42, vs42 + xxlor vs25, vs43, vs43 + xxlor vs30, vs44, vs44 + xxlor vs31, vs45, vs45 + xxlor vs28, vs46, vs46 + xxlor vs29, vs47, vs47 + + SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 + SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 + addi CO, CO, 128 +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + +.macro KERNEL2x4_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + + +.macro KERNEL2x4_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x4_2 Index, IsLast + lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A + lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x4 OffsetA, OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, 
vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 + SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 + addi CO, CO, 64 +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + +.macro KERNEL2x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL2x2_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x2_2 Index, IsLast + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x2 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + SAVE2 vs4,vs5,vs6,vs7,T1,0 + addi CO, CO, 32 +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + lxv vs52, (\OffsetB+32)(BO) // load real,imag from B + lxv vs54, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 
+
+
+.macro END2x1_2
+  /* for LOAD2x1_2 the offsets are 32 (A) and 64 (B) */
+  KERNEL2x1_2 AO, BO, 32, 64, 0, 1, 1
+.endm
+
+
+.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast
+  KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+
+.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast
+  KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+
+.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+  xxswapd vs53, vs52
+  xxswapd vs55, vs54
+  xvmaddadp vs0, vs32, vs48
+  xvmaddadp vs2, vs32, vs50
+  xvmaddadp vs1, vs32, vs49
+  xvmaddadp vs3, vs32, vs51
+.if \Complete==0
+  lxv vs32, DISP2(\Index, 0+\OffsetA)(\AREG)   // load real,imag from A
+.endif
+.if \Complete==0
+  lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG)   // load real,imag from B
+  lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG)  // load real,imag from B
+.endif
+.if \Complete==0
+  xxswapd vs49, vs48
+  xxswapd vs51, vs50
+.endif
+  xvmaddadp vs0, vs40, vs52
+  xvmaddadp vs2, vs40, vs54
+  xvmaddadp vs1, vs40, vs53
+  xvmaddadp vs3, vs40, vs55
+.if \Complete==0
+  lxv vs40, DISP2(\Index, 16+\OffsetA)(\AREG)  // load real,imag from A
+.endif
+
+.if \Complete==0
+  lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG)  // load real,imag from B
+  lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG)  // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+  addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+  addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+  addi \AREG, \AREG, DISP2(\Index,32)
+  addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+.macro KERNEL2x1
+  LOAD2x1
+  END2x1 AO, BO, 16, 32
+.endm
+
+
+.macro SAVE2x1
+  add T1, CO, LDC
+  SAVE1 vs0,vs1,CO,0
+  SAVE1 vs2,vs3,T1,0
+  addi CO, CO, 16
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro KERNEL1x8_ZERO_AND_PRIME_MMA
+  /* zero out and prime the MMA accumulators */
+  xxsetaccz 0
+  xxsetaccz 1
+  xxsetaccz 2
+  xxsetaccz 3
+.endm
+
+
+.macro KERNEL1x8_2 Index, IsLast
+  lxvp vs32, DISP16(\Index, 0)(AO)    // load real,imag from A
+  lxvp vs34, DISP16(\Index, 32)(AO)   // load real,imag from A
+  lxvp vs36, DISP16(\Index, 64)(AO)   // load real,imag from A
+  lxvp vs38, DISP16(\Index, 96)(AO)   // load real,imag from A
+  lxvp vs40, DISP16(\Index, 128)(AO)  // load real,imag from A
+  lxvp vs42, DISP16(\Index, 160)(AO)  // load real,imag from A
+  lxvp vs44, DISP16(\Index, 192)(AO)  // load real,imag from A
+  lxvp vs46, DISP16(\Index, 224)(AO)  // load real,imag from A
+  lxvp vs48, DISP2(\Index, 0)(BO)     // load real,imag from B
+  xvf64gerpp 0, vs32, vs49
+  xvf64gerpp 1, vs34, vs49
+  xvf64gerpp 2, vs36, vs49
+  xvf64gerpp 3, vs38, vs49
+  xvf64gerpp 0, vs40, vs48
+  xvf64gerpp 1, vs42, vs48
+  xvf64gerpp 2, vs44, vs48
+  xvf64gerpp 3, vs46, vs48
+.if \IsLast==1
+  addi AO, AO, DISP16(\Index,256)
+  addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x8 OffsetA, OffsetB
+  lxvp vs32, 0(AO)  // load real,imag from A
+  lxvp vs34, 32(AO) // load real,imag from A
+  lxvp vs36, 64(AO) // load real,imag from A
+  lxvp vs38, 96(AO) // load real,imag from A
+  lxv vs48, 0(BO)   // load real,imag from B
+  xvf64gerpp 0, vs32, vs48
+  xvf64gerpp 1, vs34, vs48
+  xvf64gerpp 2, vs36, vs48
+  xvf64gerpp 3, vs38, vs48
+  addi BO, BO, \OffsetB
+  addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL1x8_UNPRIME_MMA
+  /* "unprime" the MMA accumulators */
+  xxmfacc 0
+  xxmfacc 1
+  xxmfacc 2
+  xxmfacc 3
+.endm
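+
+/* One KERNEL1x8_2 pass consumes two k iterations: 16 complex doubles of
+   A (256 bytes) against 2 complex doubles of B (32 bytes), matching the
+   addi bumps above when IsLast==1.  SAVE1x8 below unscrambles vs0-vs15
+   exactly as in the N=2 cases, but stores to just one C column (CO). */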
+
+
+.macro SAVE1x8
+  xxpermdi vs32, vs0, vs1, 0b01
+  xxpermdi vs33, vs0, vs1, 0b10
+  xxpermdi vs34, vs2, vs3, 0b01
+  xxpermdi vs35, vs2, vs3, 0b10
+  xxpermdi vs36, vs4, vs5, 0b01
+  xxpermdi vs37, vs4, vs5, 0b10
+  xxpermdi vs38, vs6, vs7, 0b01
+  xxpermdi vs39, vs6, vs7, 0b10
+  xxpermdi vs40, vs8, vs9, 0b01
+  xxpermdi vs41, vs8, vs9, 0b10
+  xxpermdi vs42, vs10, vs11, 0b01
+  xxpermdi vs43, vs10, vs11, 0b10
+  xxpermdi vs44, vs12, vs13, 0b01
+  xxpermdi vs45, vs12, vs13, 0b10
+  xxpermdi vs46, vs14, vs15, 0b01
+  xxpermdi vs47, vs14, vs15, 0b10
+
+  xxlor vs2, vs32, vs32
+  xxlor vs3, vs33, vs33
+  xxlor vs0, vs34, vs34
+  xxlor vs1, vs35, vs35
+  xxlor vs6, vs36, vs36
+  xxlor vs7, vs37, vs37
+  xxlor vs4, vs38, vs38
+  xxlor vs5, vs39, vs39
+  xxlor vs10, vs40, vs40
+  xxlor vs11, vs41, vs41
+  xxlor vs8, vs42, vs42
+  xxlor vs9, vs43, vs43
+  xxlor vs14, vs44, vs44
+  xxlor vs15, vs45, vs45
+  xxlor vs12, vs46, vs46
+  xxlor vs13, vs47, vs47
+
+  SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
+  addi CO, CO, 128
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro KERNEL1x4_ZERO_AND_PRIME_MMA
+  /* zero out and prime the MMA accumulators */
+  xxsetaccz 0
+  xxsetaccz 1
+.endm
+
+
+.macro KERNEL1x4_2 Index, IsLast
+  lxvp vs32, DISP8(\Index, 0)(AO)   // load real,imag from A
+  lxvp vs34, DISP8(\Index, 32)(AO)  // load real,imag from A
+  lxvp vs40, DISP8(\Index, 64)(AO)  // load real,imag from A
+  lxvp vs42, DISP8(\Index, 96)(AO)  // load real,imag from A
+  lxvp vs48, DISP2(\Index, 0)(BO)   // load real,imag from B
+  xvf64gerpp 0, vs32, vs49
+  xvf64gerpp 1, vs34, vs49
+  xvf64gerpp 0, vs40, vs48
+  xvf64gerpp 1, vs42, vs48
+.if \IsLast==1
+  addi AO, AO, DISP8(\Index,128)
+  addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x4 OffsetA, OffsetB
+  lxvp vs32, 0(AO)  // load real,imag from A
+  lxvp vs34, 32(AO) // load real,imag from A
+  lxv vs48, 0(BO)   // load real,imag from B
+  xvf64gerpp 0, vs32, vs48
+  xvf64gerpp 1, vs34, vs48
+  addi BO, BO, \OffsetB
+  addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL1x4_UNPRIME_MMA
+  /* "unprime" the MMA accumulators */
+  xxmfacc 0
+  xxmfacc 1
+.endm
+
+
+.macro SAVE1x4
+  xxpermdi vs32, vs0, vs1, 0b01
+  xxpermdi vs33, vs0, vs1, 0b10
+  xxpermdi vs34, vs2, vs3, 0b01
+  xxpermdi vs35, vs2, vs3, 0b10
+  xxpermdi vs36, vs4, vs5, 0b01
+  xxpermdi vs37, vs4, vs5, 0b10
+  xxpermdi vs38, vs6, vs7, 0b01
+  xxpermdi vs39, vs6, vs7, 0b10
+
+  xxlor vs2, vs32, vs32
+  xxlor vs3, vs33, vs33
+  xxlor vs0, vs34, vs34
+  xxlor vs1, vs35, vs35
+  xxlor vs6, vs36, vs36
+  xxlor vs7, vs37, vs37
+  xxlor vs4, vs38, vs38
+  xxlor vs5, vs39, vs39
+
+  SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
+  addi CO, CO, 64
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro KERNEL1x2_ZERO_AND_PRIME_MMA
+  /* zero out and prime the MMA accumulator */
+  xxsetaccz 0
+.endm
+
+
+.macro KERNEL1x2_2 Index, IsLast
+  lxvp vs32, DISP4(\Index, 0)(AO)   // load real,imag from A
+  lxvp vs40, DISP4(\Index, 32)(AO)  // load real,imag from A
+  lxvp vs48, DISP2(\Index, 0)(BO)   // load real,imag from B
+  xvf64gerpp 0, vs32, vs49
+  xvf64gerpp 0, vs40, vs48
+.if \IsLast==1
+  addi AO, AO, DISP4(\Index,64)
+  addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x2 OffsetA, OffsetB
+  lxvp vs32, 0(AO)  // load real,imag from A
+  lxv vs48, 0(BO)   // load real,imag from B
+  xvf64gerpp 0, vs32, vs48
+  addi BO, BO, \OffsetB
+  addi AO, AO, \OffsetA
+.endm
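+
+/* The *_2 kernels above consume k iterations in pairs; the LOAD_END_*
+   macros handle the final odd iteration and advance AO/BO by the
+   caller-supplied byte offsets, so the UNPRIME/SAVE sequence that
+   follows always sees fully advanced pointers. */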
+
+
+.macro KERNEL1x2_UNPRIME_MMA
+  /* "unprime" the MMA accumulator */
+  xxmfacc 0
+.endm
+
+
+.macro SAVE1x2
+  xxpermdi vs32, vs0, vs1, 0b01
+  xxpermdi vs33, vs0, vs1, 0b10
+  xxpermdi vs34, vs2, vs3, 0b01
+  xxpermdi vs35, vs2, vs3, 0b10
+
+  xxlor vs2, vs32, vs32
+  xxlor vs3, vs33, vs33
+  xxlor vs0, vs34, vs34
+  xxlor vs1, vs35, vs35
+
+  SAVE2 vs0,vs1,vs2,vs3,CO,0
+  addi CO, CO, 32
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro ZERO1x1
+  xxlxor vs0, vs0, vs0
+  xxlxor vs1, vs1, vs1
+.endm
+
+
+.macro LOAD1x1
+  LOAD1x1O 0, 0
+.endm
+
+
+.macro LOAD1x1O OffsetA, OffsetB
+  lxv vs48, (\OffsetB+0)(BO)  // load real,imag from B
+  lxv vs32, (0+\OffsetA)(AO)  // load real,imag from A
+  xxswapd vs49, vs48
+.endm
+
+
+.macro END1x1_WITHOUT_ADD
+  END1x1 AO, BO, 0, 0
+.endm
+
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+  addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+  addi \AREG, \AREG, \OffsetA
+.endif
+  xvmaddadp vs0, vs32, vs48
+  xvmaddadp vs1, vs32, vs49
+.endm
+
+
+.macro LOAD1x1_2
+  LOAD1x1_2O 0, 0
+.endm
+
+
+.macro LOAD1x1_2O OffsetA, OffsetB
+  lxv vs48, (\OffsetB+0)(BO)   // load real,imag from B
+  lxv vs52, (\OffsetB+16)(BO)  // load real,imag from B
+  xxswapd vs49, vs48
+
+  lxv vs32, (0+\OffsetA)(AO)   // load real,imag from A
+  lxv vs40, (16+\OffsetA)(AO)  // load real,imag from A
+.endm
+
+
+.macro END1x1_2
+  /* for LOAD1x1_2 the offsets are 32 (A) and 32 (B) */
+  KERNEL1x1_2 AO, BO, 32, 32, 0, 1, 1
+.endm
+
+
+.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast
+  KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast
+  KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+
+.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+  xxswapd vs53, vs52
+  xvmaddadp vs0, vs32, vs48
+  xvmaddadp vs1, vs32, vs49
+.if \Complete==0
+  lxv vs32, DISP2(\Index, 0+\OffsetA)(\AREG)   // load real,imag from A
+.endif
+.if \Complete==0
+  lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG)   // load real,imag from B
+.endif
+.if \Complete==0
+  xxswapd vs49, vs48
+.endif
+  xvmaddadp vs0, vs40, vs52
+  xvmaddadp vs1, vs40, vs53
+.if \Complete==0
+  lxv vs40, DISP2(\Index, 16+\OffsetA)(\AREG)  // load real,imag from A
+.endif
+
+.if \Complete==0
+  lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG)  // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+  addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+  addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+  addi \AREG, \AREG, DISP2(\Index,32)
+  addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+.macro KERNEL1x1
+  LOAD1x1
+  END1x1 AO, BO, 16, 16
+.endm
+
+
+.macro SAVE1x1
+  SAVE1 vs0,vs1,CO,0
+  addi CO, CO, 16
+.endm
+
+/**************************** TRMM POINTER REFRESH MACROS *************************/
+
+.macro SHIFT_REG REG1, REG2, SHIFT_VAL
+.if \SHIFT_VAL==16
+  slwi \REG1, \REG2, 8
+.elseif \SHIFT_VAL==8
+  slwi \REG1, \REG2, 7
+.elseif \SHIFT_VAL==4
+  slwi \REG1, \REG2, 6
+.elseif \SHIFT_VAL==2
+  slwi \REG1, \REG2, 5
+.elseif \SHIFT_VAL==1
+  slwi \REG1, \REG2, 4
+.endif
+.endm
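+
+/* SHIFT_REG turns an element count in REG2 into a byte offset: each
+   zgemm element is a complex double (16 bytes), so a per-iteration count
+   of SHIFT_VAL elements scales by SHIFT_VAL*16, i.e. 16 -> <<8 (256B),
+   8 -> <<7, 4 -> <<6, 2 -> <<5, 1 -> <<4. */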
+/*
+// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+//   ptrbb = bb;
+// #else
+//   ptrba += off*C_A;
+//   ptrbb = bb + off*C_B;
+// #endif
+*/
+
+
+.macro REFRESH_POINTERS PTR_A, PTR_B, OFF_VAL, B_VAL, C_A, C_B
+  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    /* ptrbb = bb; */
+    mr \PTR_B, \B_VAL                 /* refresh BPOINT */
+  #else
+    /*
+    // ptrba = ptrba + off*C_A;
+    // ptrbb = bb + off*C_B;
+    */
+    SHIFT_REG T4, \OFF_VAL, \C_B      /* number of values in B, in bytes */
+    SHIFT_REG T2, \OFF_VAL, \C_A      /* number of values in A, in bytes */
+    add \PTR_B, \B_VAL, T4            /* add offset to BO */
+    add \PTR_A, \PTR_A, T2            /* add offset to AO */
+  #endif
+.endm
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+//   temp = bk - off;
+// #elif defined(LEFT)
+//   temp = off + INCR_A;   // number of values in A
+// #else
+//   temp = off + INCR_B;   // number of values in B
+// #endif
+*/
+
+
+.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B
+  #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    /* temp = bk - off; */
+    sub \TEMP_BK, \BK_VAL, \OFF_VAL
+  #elif defined(LEFT)
+    /* temp = off + INCR_A;   // number of values in A */
+    addi \TEMP_BK, \OFF_VAL, \INCR_A
+  #else
+    /* temp = off + INCR_B;   // number of values in B */
+    addi \TEMP_BK, \OFF_VAL, \INCR_B
+  #endif
+.endm
+
+/*
+// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+//   temp = bk - off;
+//   #ifdef LEFT
+//     temp -= C_A;   // number of values in A
+//   #else
+//     temp -= C_B;   // number of values in B
+//   #endif
+//   ptrba += temp*C_A;
+//   ptrbb += temp*C_B;
+// #endif
+// #ifdef LEFT
+//   off += C_A;   // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL, PTR_B, PTR_A, C_A, C_B
+  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    /* temp = bk - off; */
+    sub \TEMP_BK, \BK_VAL, \OFF_VAL
+    #ifdef LEFT
+      /* temp -= C_A;   // number of values in A */
+      addi \TEMP_BK, \TEMP_BK, -\C_A
+    #else
+      /* temp -= C_B;   // number of values in B */
+      addi \TEMP_BK, \TEMP_BK, -\C_B
+    #endif
+    /* ptrba += temp*C_A;
+       ptrbb += temp*C_B; */
+    SHIFT_REG T4, \TEMP_BK, \C_A
+    SHIFT_REG T2, \TEMP_BK, \C_B
+    add \PTR_A, \PTR_A, T4            /* ptrba += temp*C_A */
+    add \PTR_B, \PTR_B, T2            /* ptrbb += temp*C_B */
+  #endif
+  #ifdef LEFT
+    /* off += C_A;   // number of values in A */
+    addi \OFF_VAL, \OFF_VAL, \C_A
+  #endif
+.endm
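+
+/* Illustrative TRMM call sequence (hypothetical operand choices, shown
+   for a kernel with C_A=8 values of A and C_B=2 values of B per k
+   iteration; the real call sites live in the companion logic file):
+     REFRESH_POINTERS   AO, BO, TEMP_REG, B, 8, 2
+     REFRESH_TEMP_BK    T6, K, TEMP_REG, 8, 2
+     ... k loop running for T6 iterations ...
+     REFRESH_AFTER_SAVE T6, K, TEMP_REG, BO, AO, 8, 2 */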