diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index ab8fbfcd9..00d31f8b6 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,12 +7,12 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = sgemm_kernel_power9.S -DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = cgemm_kernel_power9.S +STRMMKERNEL = sgemm_kernel_power10.c +DTRMMKERNEL = dgemm_kernel_power10.c +CTRMMKERNEL = cgemm_kernel_power10.S ZTRMMKERNEL = zgemm_kernel_power9.S -SGEMMKERNEL = sgemm_kernel_power9.S +SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c @@ -22,7 +22,7 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMKERNEL = dgemm_kernel_power10.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S @@ -32,7 +32,7 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_power9.S +CGEMMKERNEL = cgemm_kernel_power10.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S new file mode 100644 index 000000000..e04f948dd --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs51 +#define alpha_i vs55 +#define save_permute_1 vs59 +#define permute_mask vs63 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power10.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power10.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power10.S b/kernel/power/cgemm_logic_power10.S new file mode 100644 index 000000000..3700ac87b --- /dev/null +++ b/kernel/power/cgemm_logic_power10.S @@ -0,0 +1,2814 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 
+ KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 
32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + /* Pre set value in vs57 as 0xffff0000ffff0000 for masking */ + vspltisb v24, -1 + vspltisb v25, 0 + xxsldwi vs57, vs56, vs57, 1 + xxpermdi vs57, vs57, vs57, 3 + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S new file mode 100644 index 000000000..b66e93405 --- /dev/null +++ b/kernel/power/cgemm_macros_power10.S @@ -0,0 +1,2131 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 8 +#define DISP32(ind, disp) (ind*unit_size*32+disp) +#define DISP16(ind, disp) (ind*unit_size*16+disp) +#define DISP8(ind, disp) (ind*unit_size*8+disp) +#define DISP4(ind, disp) (ind*unit_size*4+disp) +#define DISP2(ind, disp) (ind*unit_size*2+disp) +#define DISP1(ind, disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ + +.macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmulsp \VSOUT1, \VSINII, alpha_i + xvmulsp \VSOUT2, \VSINRR, alpha_i +.endm + +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + +.macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmsubasp \VSOUT1, \VSINRR, alpha_r + xvmaddasp \VSOUT2, \VSINII, alpha_r +.endm + +.macro PERMUTE1 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, \OUT, vs62, 1 +.endm +.macro PERMUTE2 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, vs62, \OUT, 1 + xxperm \OUT, \OUT, permute_mask +.endm +.macro PERMUTE3 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, vs62, \OUT, 2 +.endm +.macro PERMUTE4 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, \OUT, vs62, 2 + xxperm \OUT, \OUT, permute_mask +.endm +.macro GROUP1 + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + xxperm vs9, vs37, permute_mask + xxperm vs13, vs45, permute_mask +.endm +.macro AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + 
AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13 +.endm +.macro GROUP2 + xxperm vs0, vs34, permute_mask + xxperm vs4, vs42, permute_mask + xxperm vs1, vs35, permute_mask + xxperm vs5, vs43, permute_mask + xxperm vs8, vs38, permute_mask + xxperm vs12, vs46, permute_mask + xxperm vs9, vs39, permute_mask + xxperm vs13, vs47, permute_mask +.endm +.macro AGG_GROUP2 + AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4 + AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5 + AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12 + AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13 +.endm +.macro MULTIPLY_GROUP1 + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +.endm +.macro MULTIPLY_GROUP2 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 +.endm +/* reconstruct r, i pairs*/ +.macro RECONSTRUCT_PAIR1 + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs8, vs9, save_permute_1 + xxperm vs10, vs11, save_permute_1 +.endm +.macro RECONSTRUCT_PAIR2 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 + xxperm vs12, vs13, save_permute_1 + xxperm vs14, vs15, save_permute_1 +.endm +.macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4 + xxmfacc \ACC + PERMUTE1 \O1, \R3, \R2, \R1, \R0 + PERMUTE2 \O2, \R1, \R0, \R3, \R2 + PERMUTE3 \O3, \R1, \R0, \R3, \R2 + PERMUTE4 \O4, \R3, \R2, \R1, \R0 +.endm +/* macros for N=4 and M=8 +**********************************************************************************************/ +.macro ZERO4x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + +.macro LOAD4x8 + LOAD4x8O 0, 0 +.endm + +.macro LOAD4x8O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END4x8_NORMAL + END4x8 AO, BO, 64, 32 +.endm + +.macro END4x8_WITHOUT_ADD + END4x8 AO, BO, 0, 0 +.endm + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.endm + +.macro LOAD4x8_2 + LOAD4x8_2O 0, 0 +.endm + +.macro LOAD4x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs40, (64+\OffsetA)(AO) + lxvp vs42, (64+32+\OffsetA)(AO) +.endm + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1 +.endm + +.macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + 
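/* first of the two unrolled k iterations: each xvf32gerpp adds a 4x4 FP32 rank-1 product into its accumulator; accumulators 0-3 pair the A registers vs32/vs33/vs36/vs37 with B register vs35, accumulators 4-7 pair them with vs34 */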
xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 42, 39 + xvf32gerpp 2, 43, 39 + xvf32gerpp 1, 40, 39 + xvf32gerpp 0, 41, 39 + xvf32gerpp 7, 42, 38 + xvf32gerpp 6, 43, 38 + xvf32gerpp 5, 40, 38 + xvf32gerpp 4, 41, 38 +.if \Complete==0 + lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64, 32 +.endm + +.macro SAVE4x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60 + SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61 + SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20 + SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + xxperm vs10, vs38, permute_mask + xxperm vs14, vs46, permute_mask + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + xxperm vs11, vs39, permute_mask + xxperm vs15, vs47, permute_mask + xxperm vs0, vs48, permute_mask + xxperm vs4, vs56, permute_mask + xxperm vs1, vs49, permute_mask + xxperm vs5, vs16, permute_mask + AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14 + xxperm vs2, vs50, permute_mask + xxperm vs6, vs58, permute_mask + AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15 + xxperm vs3, vs17, permute_mask + xxperm vs7, vs19, permute_mask + AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4 + xxperm vs8, vs52, permute_mask + xxperm vs12, vs60, permute_mask + AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5 + xxperm vs9, vs53, permute_mask + xxperm vs13, vs61, permute_mask + AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6 + xxperm vs10, vs54, permute_mask + xxperm vs14, vs21, permute_mask + AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7 + xxperm vs11, vs18, permute_mask + xxperm vs15, vs20, permute_mask + AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12 + AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13 +/*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +#ifndef TRMMKERNEL + lxvp vs32, 0(T2) +#endif + MULT_APLHA_PART1 vs36, vs44, 
vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs40, 32(T2) +#endif + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 +#ifndef TRMMKERNEL + lxvp vs34, 0(T3) +#endif + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs42, 32(T3) +#endif + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + MULT_APLHA_PART1 vs48, vs56, vs0, vs1 + MULT_APLHA_PART1 vs49, vs16, vs2, vs3 + stxvp vs26, 32(CO) + MULT_APLHA_PART1 vs50, vs58, vs4, vs5 + MULT_APLHA_PART1 vs17, vs19, vs6, vs7 + stxvp vs28, 0(T1) + MULT_APLHA_PART2 vs48, vs56, vs0, vs1 + MULT_APLHA_PART2 vs49, vs16, vs2, vs3 + stxvp vs30, 32(T1) + MULT_APLHA_PART2 vs50, vs58, vs4, vs5 + MULT_APLHA_PART2 vs17, vs19, vs6, vs7 + MULT_APLHA_PART1 vs52, vs60, vs8, vs9 + MULT_APLHA_PART1 vs53, vs61, vs10, vs11 + MULT_APLHA_PART1 vs54, vs21, vs12, vs13 + MULT_APLHA_PART1 vs18, vs20, vs14, vs15 + MULT_APLHA_PART2 vs52, vs60, vs8, vs9 + MULT_APLHA_PART2 vs53, vs61, vs10, vs11 + MULT_APLHA_PART2 vs54, vs21, vs12, vs13 + MULT_APLHA_PART2 vs18, vs20, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs32, vs32, vs3 + xvaddsp vs33, vs33, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs40, vs40, vs7 + xvaddsp vs41, vs41, vs5 + xvaddsp vs34, vs34, vs11 + xvaddsp vs35, vs35, vs9 + xvaddsp vs42, vs42, vs15 + xvaddsp vs43, vs43, vs13 +#else + xxpermdi vs33, vs8, vs0, 2 + xxpermdi vs32, vs10, vs2, 2 + xxpermdi vs41, vs12, vs4, 2 + xxpermdi vs40, vs14, vs6, 2 + xxpermdi vs35, vs0, vs8, 2 + xxpermdi vs34, vs2, vs10, 2 + xxpermdi vs43, vs4, vs12, 2 + xxpermdi vs42, vs6, vs14, 2 +#endif + stxvp vs32, 0(T2) + stxvp vs40, 32(T2) + stxvp vs34, 0(T3) + stxvp vs42, 32(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro ZERO4x4 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD4x4 + LOAD4x4O 0, 0 +.endm + +.macro LOAD4x4O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END4x4_NORMAL + END4x4 AO, BO, 32, 32 +.endm + +.macro END4x4_WITHOUT_ADD + END4x4 AO, BO, 0, 0 +.endm + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.endm + 
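Note on the save path above: MULT_APLHA_PART1 and MULT_APLHA_PART2 together perform an ordinary complex multiply of the aggregated (real, imaginary) accumulator lanes by alpha, split into a plain multiply by alpha_i followed by a fused multiply-subtract/multiply-add with alpha_r. A minimal scalar C sketch of that arithmetic follows; the cfloat type and the scale_by_alpha name are illustrative only and not part of the kernel.

#include <stdio.h>

typedef struct { float r, i; } cfloat;

/* scalar model of MULT_APLHA_PART1 followed by MULT_APLHA_PART2 */
static cfloat scale_by_alpha (cfloat acc, float alpha_r, float alpha_i)
{
  float t_r = acc.i * alpha_i;     /* PART1: xvmulsp by alpha_i */
  float t_i = acc.r * alpha_i;
  cfloat out;
  out.r = acc.r * alpha_r - t_r;   /* PART2: xvmsubasp by alpha_r */
  out.i = t_i + acc.i * alpha_r;   /* PART2: xvmaddasp by alpha_r */
  return out;                      /* == (acc.r + i*acc.i) * (alpha_r + i*alpha_i) */
}

int main (void)
{
  cfloat acc = { 3.0f, -2.0f };                /* made-up accumulator value */
  cfloat c = scale_by_alpha (acc, 0.5f, 1.5f);
  printf ("%.1f %.1f\n", c.r, c.i);            /* prints 4.5 3.5 */
  return 0;
}

The AGGREGATE_REALS_IMAGES variants earlier in this file supply the (real, imaginary) inputs with the sign pattern each conjugation case needs, which is why the CC/CR/RC/RR branch notes in its comments that it negates alpha instead.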
+.macro LOAD4x4_2 + LOAD4x4_2O 0, 0 +.endm + +.macro LOAD4x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1 +.endm + +.macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 36, 38 + xvf32gerpp 2, 37, 38 + xvf32gerpp 1, 36, 39 + xvf32gerpp 0, 37, 39 +.if \Complete==0 + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32, 32 +.endm + +.macro SAVE4x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + #ifndef TRMMKERNEL + lxvp vs28, 0(T2) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 0(T3) +#endif + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 + xvaddsp vs28, vs28, vs7 + xvaddsp vs29, vs29, vs5 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 + xxpermdi vs29, vs12, vs4, 2 + xxpermdi vs28, vs14, vs6, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + stxvp vs28, 0(T2) + stxvp vs30, 0(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro ZERO4x2 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x2 + LOAD4x2O 0, 0 +.endm + +.macro LOAD4x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x2_NORMAL + END4x2 AO, BO, 16, 32 +.endm + +.macro END4x2_WITHOUT_ADD + END4x2 AO, BO, 0, 0 +.endm + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 1, 34, 32 + xvf32gerpp 0, 35, 32 +.endm + +.macro 
LOAD4x2_2 + LOAD4x2_2O 0, 0 +.endm + +.macro LOAD4x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1 +.endm + +.macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 1, 34, 33 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 1, 36, 32 + xvf32gerpp 0, 37, 32 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16, 32 +.endm + +.macro SAVE4x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25, 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27, 0(T3) +#endif + GROUP1 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs10, vs2, 0 + xxpermdi vs3, vs0, vs8, 3 + xxpermdi vs11, vs2, vs10, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 + xvaddsp vs25, vs25, vs3 + xvaddsp vs27, vs27, vs11 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs10, vs2, 0 + xxpermdi vs25, vs0, vs8, 3 + xxpermdi vs27, vs2, vs10, 3 +#endif + stxv vs24, 0(CO) + stxv vs25, 0(T1) + stxv vs26, 0(T2) + stxv vs27, 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro ZERO4x1 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x1 + LOAD4x1O 0, 0 +.endm + +.macro LOAD4x1O OffsetA, OffsetB + lxsd v0, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x1_NORMAL + END4x1 AO, BO,8, 32 +.endm + +.macro END4x1_WITHOUT_ADD + END4x1 AO, BO, 0, 0 +.endm + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD4x1_2 + LOAD4x1_2O 0, 0 +.endm + +.macro LOAD4x1_2O OffsetA, OffsetB + lxv vs32, (\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1 +.endm + +.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x1_L2 OffsetA, 
OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 0, 37, 33 + xvf32gerpp 1, 36, 33 +.if \Complete==0 + lxv vs32, DISP2(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8, 32 +.endm + +.macro SAVE4x1 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 1 + xxpermdi vs40, vs40, vs44, 1 + xxpermdi vs33, vs33, vs37, 1 + xxpermdi vs41, vs41, vs45, 1 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6, 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7, 0(T3) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + xxspltd vs9, vs2, 0 + xxspltd vs11, vs2, 1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 + xvaddsp vs38, vs38, vs9 + xvaddsp vs39, vs39, vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 + xxspltd vs38, vs2, 0 + xxspltd vs39, vs2, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + stxsd v6, 0(T2) + stxsd v7, 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro ZERO2x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD2x8 + LOAD2x8O 0, 0 +.endm + +.macro LOAD2x8O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END2x8_NORMAL + END2x8 AO, BO, 64, 16 +.endm + +.macro END2x8_WITHOUT_ADD + END2x8 AO, BO, 0, 0 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x8_2 + LOAD2x8_2O 0, 0 +.endm + +.macro LOAD2x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1 +.endm + +.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, 
BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 2, 37, 35 + xvf32gerpp 3, 36, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 + +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 41, 34 + xvf32gerpp 3, 40, 34 + xvf32gerpp 0, 39, 34 + xvf32gerpp 1, 38, 34 + +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64, 16 +.endm + +.macro SAVE2x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) + stxvp vs28, 0(T1) + stxvp vs30, 32(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro ZERO2x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD2x4 + LOAD2x4O 0, 0 +.endm + +.macro LOAD2x4O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END2x4_NORMAL + END2x4 AO, BO, 32, 16 +.endm + +.macro END2x4_WITHOUT_ADD + END2x4 AO, BO, 0, 0 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x4_2 + LOAD2x4_2O 0, 0 +.endm + +.macro LOAD2x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1 +.endm + +.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 
AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 37, 34 + xvf32gerpp 1, 36, 34 +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32, 16 +.endm + +.macro SAVE2x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + GROUP1 + AGG_GROUP1 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro ZERO2x2 + xxsetaccz 0 +.endm + +.macro LOAD2x2 + LOAD2x2O 0, 0 +.endm + +.macro LOAD2x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxv vs34, (\OffsetB+0)(BO) +.endm + +.macro END2x2_NORMAL + END2x2 AO, BO, 16, 16 +.endm + +.macro END2x2_WITHOUT_ADD + END2x2 AO, BO, 0, 0 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 32 +.endm + +.macro LOAD2x2_2 + LOAD2x2_2O 0, 0 +.endm + +.macro LOAD2x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) +.endm + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1 +.endm + +.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 32 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs34, DISP4(\Index, \OffsetA)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16, 16 +.endm + +.macro SAVE2x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + 
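/* xxperm with permute_mask swaps the real and imaginary words of each single-precision complex element; AGGREGATE_REALS_IMAGES_A_PERMUTE below combines the original and swapped copies into the real and imaginary parts of the products */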
xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs8, vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs0, vs8, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs0, vs8, 3 +#endif + stxv vs24, 0(CO) + stxv vs26, 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD2x1 + LOAD2x1O 0, 0 +.endm + +.macro LOAD2x1O OffsetA, OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_NORMAL + END2x1 AO, BO,8, 16 +.endm + +.macro END2x1_WITHOUT_ADD + END2x1 AO, BO, 0, 0 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD2x1_2 + LOAD2x1_2O 0, 0 +.endm + +.macro LOAD2x1_2O OffsetA, OffsetB + lxv vs27, (\OffsetA)(AO) + lxvp vs4, (0+\OffsetB)(BO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1 +.endm + +.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetA)(\AREG) + xxspltd vs8, vs27, 1 +.endif +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8, 16 +.endm + +.macro SAVE2x1 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 +#else + /*--v4==vs36 
v5==vs37---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro ZERO1x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD1x8 + LOAD1x8O 0, 0 +.endm + +.macro LOAD1x8O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END1x8_NORMAL + END1x8 AO, BO, 64,8 +.endm + +.macro END1x8_WITHOUT_ADD + END1x8 AO, BO, 0, 0 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.endm + +.macro LOAD1x8_2 + LOAD1x8_2O 0, 0 +.endm + +.macro LOAD1x8_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + vspltisb v10, 0 + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1 +.endm + +.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.if \Complete==0 + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 39 + xvf32gerpp 1, 35, 38 +.if \Complete==0 + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 35, 41 + xvf32gerpp 3, 35, 40 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + +.macro SAVE1x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs34, vs34, vs38, 0 + xxpermdi vs35, vs35, vs39, 0 + xxpermdi vs40, vs40, vs44, 0 + xxperm vs40, vs40, permute_mask + xxpermdi vs41, vs41, vs45, 0 + xxperm vs41, vs41, permute_mask + xxpermdi vs42, vs42, vs46, 0 + xxperm vs42, vs42, permute_mask + xxpermdi vs43, vs43, vs47, 0 + xxperm vs43, vs43, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + AGGREGATE_REALS_IMAGES vs32, 
vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 + xxperm vs4, vs5, vs28 + xxperm vs6, vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + xvaddsp vs26, vs26, vs6 + xvaddsp vs27, vs27, vs4 + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 +**********************************************************************************************/ + +.macro ZERO1x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD1x4 + LOAD1x4O 0, 0 +.endm + +.macro LOAD1x4O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END1x4_NORMAL + END1x4 AO, BO, 32,8 +.endm + +.macro END1x4_WITHOUT_ADD + END1x4 AO, BO, 0, 0 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD1x4_2 + LOAD1x4_2O 0, 0 +.endm + +.macro LOAD1x4_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1 +.endm + +.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 37 + xvf32gerpp 1, 35, 36 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + +.macro SAVE1x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs40, vs40, vs44, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs41, vs41, vs45, 0 + xxperm vs40, vs40, permute_mask + xxperm vs41, vs41, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES 
vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + stxvp vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 +**********************************************************************************************/ + +.macro ZERO1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x2 + LOAD1x2O 0, 0 +.endm + +.macro LOAD1x2O OffsetA, OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_NORMAL + END1x2 AO, BO, 16,8 +.endm + +.macro END1x2_WITHOUT_ADD + END1x2 AO, BO, 0, 0 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD1x2_2 + LOAD1x2_2O 0, 0 +.endm + +.macro LOAD1x2_2O OffsetA, OffsetB + lxv vs27, (\OffsetB)(BO) + lxvp vs4, (0+\OffsetA)(AO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1 +.endm + +.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 + +.if \Complete==0 + xxspltd vs8, vs27, 1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP4(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs0 + stxv vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro ZERO1x1 + xxlxor vs32, vs32, 
vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x1 + LOAD1x1O 0, 0 +.endm + +.macro LOAD1x1O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + +.macro END1x1_NORMAL + END1x1 AO, BO,8,8 +.endm + +.macro END1x1_WITHOUT_ADD + END1x1 AO, BO, 0, 0 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs37, vs36 + xvmaddasp vs40, vs37, vs38 +.endm + +.macro LOAD1x1_2 + LOAD1x1_2O 0, 0 +.endm + +.macro LOAD1x1_2O OffsetA, OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1 +.endm + +.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs4, vs8 + xvmaddasp vs40, vs4, vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index, \OffsetB)(\BREG) + lxv vs4, DISP2(\Index, \OffsetB)(\AREG) + xxperm vs10, vs8, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP2(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP2(\Index, 16) +.endif +.endif +.endm + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33, vs32, vs32, 2 + xxpermdi vs41, vs40, vs40, 2 + xvaddsp vs32, vs32, vs33 + xvaddsp vs40, vs40, vs41 + + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs37, vs1 + MULT_APLHA_PART2 vs32, vs40, vs37, vs1 +/* reconstruct r, i pairs*/ + xxperm vs37, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36, vs36, vs37 + stxsd v4, 0(CO) +#else +/* vs37 is v5 */ + stxsd v5, 0(CO) +#endif + addi CO, CO, 8 +.endm + +/****************************TRMM POINTER REFRESH MACROSES*************************/ +.macro SHIFT_REG REG1,REG2,SHIFT_VAL +.if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 +.elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 +.elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 +.elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 +.elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 +.endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +/* ptrbb = bb;*/ + mr \PTR_B, \B_VAL /* refresh BPOINT */ +#else +/* +// ptrba =ptrba+ off*C_A; +// ptrbb = bb + off*C_B; +*/ + SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */ + SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL, T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ +#endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif 
defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK, \OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ +.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK, \TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK, \TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4, \TEMP_BK, \C_A + SHIFT_REG T2, \TEMP_BK, \C_B + add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B, T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL, \OFF_VAL, \C_A + #endif +.endm diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c new file mode 100644 index 000000000..b3ee301be --- /dev/null +++ b/kernel/power/dgemm_kernel_power10.c @@ -0,0 +1,864 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#ifdef TRMMKERNEL +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + v4sf_t valpha = { alpha, alpha }; + N = n >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + BLASLONG l = 0; + PREFETCH1 (CO, 0); + PREFETCH1 (CO + ldc, 0); + PREFETCH1 (CO + ldc + ldc, 0); + PREFETCH1 (CO + ldc + ldc + ldc, 0); + PREFETCH1 (CO, 128); + PREFETCH1 (CO + ldc, 128); + PREFETCH1 (CO + ldc + ldc, 128); + PREFETCH1 (CO + ldc + ldc + ldc, 128); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC (&acc5, 10); + SAVE_ACC (&acc7, 14); + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif 
+ v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; + v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + SAVE2x4_ACC (&acc4, 8); + SAVE2x4_ACC (&acc5, 10); + SAVE2x4_ACC (&acc6, 12); + SAVE2x4_ACC (&acc7, 14); + CO += 16; + AO += temp 
<< 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 1]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + 
REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + v4sf_t t4 = { 0, 0 }; + v4sf_t t5 = { 0, 0 }; + v4sf_t t6 = { 0, 0 }; + v4sf_t t7 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; + v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; + v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; + v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; + v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; + v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; + v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; + v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + t4 += rowA4 * rowB; + t5 += rowA5 * rowB; + t6 += rowA6 * rowB; + t7 += rowA7 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + t4 = t4 * valpha; + t5 = t5 * valpha; + t6 = t6 * valpha; + t7 = t7 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; + CO[8] = t4[0]; + CO[9] = t4[1]; + CO[10] = t5[0]; + CO[11] = t5[1]; + CO[12] = t6[0]; + CO[13] = t6[1]; + CO[14] = t7[0]; + CO[15] = t7[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; + CO[8] += t4[0]; + CO[9] += t4[1]; + CO[10] += t5[0]; + CO[11] += t5[1]; + CO[12] += t6[0]; + CO[13] += t6[1]; + CO[14] += t7[0]; + CO[15] += t7[1]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; + v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] }; + v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; + v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; + v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; +#else + CO[0] += t[0]; + CO[1] 
+= t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c new file mode 100644 index 000000000..01c122c6d --- /dev/null +++ b/kernel/power/sgemm_kernel_power10.c @@ -0,0 +1,1334 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if defined(TRMMKERNEL) +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * 
alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif +#define KERNEL(i, j) \ + __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ + __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + N = n >> 3; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + BLASLONG K = temp / 64; + for (l = 0; l < K; l++) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL 
(10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + KERNEL (64, 128); + KERNEL (66, 132); + KERNEL (68, 136); + KERNEL (70, 140); + KERNEL (72, 144); + KERNEL (74, 148); + KERNEL (76, 152); + KERNEL (78, 156); + KERNEL (80, 160); + KERNEL (82, 164); + KERNEL (84, 168); + KERNEL (86, 172); + KERNEL (88, 176); + KERNEL (90, 180); + KERNEL (92, 184); + KERNEL (94, 188); + KERNEL (96, 192); + KERNEL (98, 196); + KERNEL (100, 200); + KERNEL (102, 204); + KERNEL (104, 208); + KERNEL (106, 212); + KERNEL (108, 216); + KERNEL (110, 220); + KERNEL (112, 224); + KERNEL (114, 228); + KERNEL (116, 232); + KERNEL (118, 236); + KERNEL (120, 240); + KERNEL (122, 244); + KERNEL (124, 248); + KERNEL (126, 252); + AO += 1024; + BO += 512; + } + if ((temp & 63) >> 5) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + AO += 512; + BO += 256; + } + if ((temp & 31) >> 4) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + AO += 256; + BO += 128; + } + if ((temp & 15) >> 3) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + AO += 128; + BO += 64; + } + if ((temp & 7) >> 2) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + AO += 64; + BO += 32; + } + if ((temp & 3) >> 1) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + AO += 32; + BO += 16; + } + if ((temp & 1) >> 0) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + AO += 16; + BO += 8; + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 8) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t 
*rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + AO += (temp << 3); + BO += (temp << 3); + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (temp << 2); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (temp << 1); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2], + BO[(l << 3) + 3] + }; + v4sf_t rowB1 = + { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6], + BO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; + CO[4 * ldc] = t1[0]; + CO[5 * ldc] = t1[1]; + CO[6 * ldc] = t1[2]; + CO[7 * ldc] = t1[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; +#endif + CO += 1; + AO += temp; + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + + B += k << 3; + } + N = (n & 7) >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; +#if !defined(TRMMKERNEL) + i 
= m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 
1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2], + BO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2) +#else + BO = B; + temp = k; +#endif + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, 
acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (temp << 1); l += 2) + { + v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] }; + v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[0 * ldc + 1] = t[2]; + CO[1 * ldc + 1] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; +#endif + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], 0, 0 }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2], + AO[(l << 4) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6], + AO[(l << 4) + 7] + }; + v4sf_t rowA2 = + { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10], + AO[(l 
<< 4) + 11] + }; + v4sf_t rowA3 = + { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14], + AO[(l << 4) + 15] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; + CO[8] = t2[0]; + CO[9] = t2[1]; + CO[10] = t2[2]; + CO[11] = t2[3]; + CO[12] = t3[0]; + CO[13] = t3[1]; + CO[14] = t3[2]; + CO[15] = t3[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2], + AO[(l << 3) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6], + AO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2], + AO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], 0, 0 }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * 
alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +}
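
Reviewer note (not part of the patch): the two new C kernels (sgemm and dgemm) follow the same MMA pattern per k iteration -- zero a `__vector_quad` accumulator, chain rank-1 updates with an outer-product builtin, then disassemble the accumulator and scale by alpha in the SAVE_* macros. The sketch below is a hypothetical, self-contained illustration of that single-precision sequence; the example values and file name are my own and nothing here is taken from the diff beyond the builtin calls and the result[3..0] row ordering used by SAVE_ACC. It assumes GCC >= 10 targeting POWER10.

/* mma_demo.c -- hypothetical standalone sketch of the xvf32gerpp sequence.
 * Build: gcc -O2 -mcpu=power10 mma_demo.c
 */
#include <stdio.h>

typedef unsigned char vec_t __attribute__ ((vector_size (16)));
typedef float v4sf_t __attribute__ ((vector_size (16)));

int
main (void)
{
  /* One 4-element slice of packed B and packed A, example values only. */
  float b[4] __attribute__ ((aligned (16))) = { 1, 2, 3, 4 };
  float a[4] __attribute__ ((aligned (16))) = { 10, 20, 30, 40 };
  float alpha = 0.5f;
  float c[4][4] = { { 0 } };

  __vector_quad acc;                   /* 512-bit accumulator                 */
  v4sf_t result[4];                    /* accumulator rows after disassemble  */
  v4sf_t valpha = { alpha, alpha, alpha, alpha };

  __builtin_mma_xxsetaccz (&acc);      /* acc = 0, as SET_ACC_ZERO* does      */
  /* acc[i][j] += b[i] * a[j]; the kernels pass rowB first, rowA second,
     and repeat this call over k.                                             */
  __builtin_mma_xvf32gerpp (&acc, *(vec_t *) b, *(vec_t *) a);
  __builtin_mma_disassemble_acc (result, &acc);

  /* Scale by alpha and add into the 4x4 C tile, reading result[3 - i]
     for row i as the patch's SAVE_ACC macro does.                            */
  for (int i = 0; i < 4; i++)
    {
      v4sf_t row = result[3 - i] * valpha;
      for (int j = 0; j < 4; j++)
        c[i][j] += row[j];
      printf ("%6.1f %6.1f %6.1f %6.1f\n", c[i][0], c[i][1], c[i][2], c[i][3]);
    }
  return 0;
}

The double-precision kernel differs mainly in that it first packs B values into a `__vector_pair` with `__builtin_mma_assemble_pair` and accumulates with `__builtin_mma_xvf64gerpp`, so each accumulator holds a 4x2 tile of doubles rather than a 4x4 tile of floats.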