diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 760d568cd..d40b20dd8 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -3,14 +3,18 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = gemm_kernel_power6.S +STRMMKERNEL = strmm_kernel_16x8_power8.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = gemm_kernel_power6.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o @@ -146,7 +150,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c -#ZGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S new file mode 100644 index 000000000..9f221301a --- /dev/null +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -0,0 +1,354 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define o4 r15 +#define o12 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "sgemm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + 
std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, 2 + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + li PRE, 384 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + + addi T1, SP, 300 + stfs f1, 0(T1) + stfs f1, 4(T1) + stfs f1, 8(T1) + stfs f1,12(T1) + + lxsspx vs28, 0, T1 + + xxspltw alpha_r, vs28 , 0 + lxvw4x alpha_vr, 0, T1 + + + +#include "sgemm_logic_16x8_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S new file mode 100644 index 000000000..6c5a1c7ef --- /dev/null +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -0,0 +1,2172 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + + + srawi. J, N, 3 + ble .LSGEMM_L8_END + +.LSGEMM_L8_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + srawi. I, M, 4 + ble .LSGEMM_L8x16_END + +.LSGEMM_L8x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x16_SUB4 + +.LSGEMM_L8x16_LOOP_START: + + dcbt AO, PRE + LOAD8x16_1 + KERNEL8x16_I1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble .LSGEMM_L8x16_LOOP_END + + .align 5 + +.LSGEMM_L8x16_LOOP: + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x16_LOOP + +.LSGEMM_L8x16_LOOP_END: + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + KERNEL8x16_E2 + + b .LSGEMM_L8x16_SUB1 + +.LSGEMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b .LSGEMM_L8x16_SUB1 + +.LSGEMM_L8x16_SUB0: + + andi. L, K, 7 + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x16_SAVE + b .LSGEMM_L8x16_SUB2 + +.LSGEMM_L8x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x16_SAVE + +.LSGEMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x16_SUB2 + +.LSGEMM_L8x16_SAVE: + + SAVE8x16 + + addic. I, I, -1 + bgt .LSGEMM_L8x16_BEGIN + +.LSGEMM_L8x16_END: + +.LSGEMM_L8x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L8x1_END + + andi. T1, M, 8 + ble .LSGEMM_L8x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x8_SUB4 + +.LSGEMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. 
L, L, -2 + ble .LSGEMM_L8x8_LOOP_END + + .align 5 + +.LSGEMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x8_LOOP + +.LSGEMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b .LSGEMM_L8x8_SUB1 + +.LSGEMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b .LSGEMM_L8x8_SUB1 + +.LSGEMM_L8x8_SUB0: + + andi. L, K, 7 + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x8_SAVE + b .LSGEMM_L8x8_SUB2 + +.LSGEMM_L8x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x8_SAVE + +.LSGEMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x8_SUB2 + +.LSGEMM_L8x8_SAVE: + + SAVE8x8 + +.LSGEMM_L8x8_END: + +.LSGEMM_L8x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L8x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x4_SUB4 + +.LSGEMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble .LSGEMM_L8x4_LOOP_END + + .align 5 + +.LSGEMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x4_LOOP + +.LSGEMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b .LSGEMM_L8x4_SUB1 + +.LSGEMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b .LSGEMM_L8x4_SUB1 + +.LSGEMM_L8x4_SUB0: + + andi. L, K, 7 + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x4_SAVE + b .LSGEMM_L8x4_SUB2 + +.LSGEMM_L8x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x4_SAVE + +.LSGEMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x4_SUB2 + +.LSGEMM_L8x4_SAVE: + + SAVE8x4 + +.LSGEMM_L8x4_END: + +.LSGEMM_L8x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L8x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x2_SUB4 + +.LSGEMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble .LSGEMM_L8x2_LOOP_END + + .align 5 + +.LSGEMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x2_LOOP + +.LSGEMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b .LSGEMM_L8x2_SUB1 + +.LSGEMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b .LSGEMM_L8x2_SUB1 + +.LSGEMM_L8x2_SUB0: + + andi. L, K, 7 + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x2_SAVE + b .LSGEMM_L8x2_SUB2 + +.LSGEMM_L8x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x2_SAVE + +.LSGEMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x2_SUB2 + +.LSGEMM_L8x2_SAVE: + + SAVE8x2 + +.LSGEMM_L8x2_END: + +.LSGEMM_L8x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L8x1_END + mr BO, B + srawi. 
L, K, 3 + ble .LSGEMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x1_SUB4 + +.LSGEMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble .LSGEMM_L8x1_LOOP_END + + .align 5 + +.LSGEMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x1_LOOP + +.LSGEMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b .LSGEMM_L8x1_SUB1 + +.LSGEMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b .LSGEMM_L8x1_SUB1 + +.LSGEMM_L8x1_SUB0: + + andi. L, K, 7 + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x1_SAVE + b .LSGEMM_L8x1_SUB2 + +.LSGEMM_L8x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x1_SAVE + +.LSGEMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x1_SUB2 + +.LSGEMM_L8x1_SAVE: + + SAVE8x1 + +.LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt .LSGEMM_L8_BEGIN + + andi. T2, N, 7 + ble .L999 + +.LSGEMM_L8_END: + + b .LSGEMM_L4_BEGIN + +.L999_H1: + + b .L999 + +.LSGEMM_L4_BEGIN: + + andi. T1, N, 4 + ble .LSGEMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 4 + ble .LSGEMM_L4x16_END + +.LSGEMM_L4x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x16_SUB4 + +.LSGEMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble .LSGEMM_L4x16_LOOP_END + + .align 5 + +.LSGEMM_L4x16_LOOP: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x16_LOOP + +.LSGEMM_L4x16_LOOP_END: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + KERNEL4x16_E2 + + b .LSGEMM_L4x16_SUB1 + +.LSGEMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b .LSGEMM_L4x16_SUB1 + +.LSGEMM_L4x16_SUB0: + + andi. L, K, 7 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x16_SAVE + b .LSGEMM_L4x16_SUB2 + +.LSGEMM_L4x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x16_SAVE + +.LSGEMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x16_SUB2 + +.LSGEMM_L4x16_SAVE: + + SAVE4x16 + + addic. I, I, -1 + bgt .LSGEMM_L4x16_BEGIN + +.LSGEMM_L4x16_END: + +.LSGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L4x1_END + + andi. T1, M, 8 + ble .LSGEMM_L4x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x8_SUB4 + +.LSGEMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. 
L, L, -2 + ble .LSGEMM_L4x8_LOOP_END + + .align 5 + +.LSGEMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x8_LOOP + +.LSGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LSGEMM_L4x8_SUB1 + +.LSGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LSGEMM_L4x8_SUB1 + +.LSGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x8_SAVE + b .LSGEMM_L4x8_SUB2 + +.LSGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x8_SAVE + +.LSGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x8_SUB2 + +.LSGEMM_L4x8_SAVE: + + SAVE4x8 + +.LSGEMM_L4x8_END: + +.LSGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L4x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x4_SUB4 + +.LSGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LSGEMM_L4x4_LOOP_END + + .align 5 + +.LSGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x4_LOOP + +.LSGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LSGEMM_L4x4_SUB1 + +.LSGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LSGEMM_L4x4_SUB1 + +.LSGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x4_SAVE + b .LSGEMM_L4x4_SUB2 + +.LSGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x4_SAVE + +.LSGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x4_SUB2 + +.LSGEMM_L4x4_SAVE: + + SAVE4x4 + +.LSGEMM_L4x4_END: + +.LSGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L4x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x2_SUB4 + +.LSGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LSGEMM_L4x2_LOOP_END + + .align 5 + +.LSGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x2_LOOP + +.LSGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LSGEMM_L4x2_SUB1 + +.LSGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LSGEMM_L4x2_SUB1 + +.LSGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x2_SAVE + b .LSGEMM_L4x2_SUB2 + +.LSGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x2_SAVE + +.LSGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x2_SUB2 + +.LSGEMM_L4x2_SAVE: + + SAVE4x2 + +.LSGEMM_L4x2_END: + +.LSGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L4x1_END + mr BO, B + srawi. 
L, K, 3 + ble .LSGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x1_SUB4 + +.LSGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LSGEMM_L4x1_LOOP_END + + .align 5 + +.LSGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x1_LOOP + +.LSGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LSGEMM_L4x1_SUB1 + +.LSGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LSGEMM_L4x1_SUB1 + +.LSGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x1_SAVE + b .LSGEMM_L4x1_SUB2 + +.LSGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x1_SAVE + +.LSGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x1_SUB2 + +.LSGEMM_L4x1_SAVE: + + SAVE4x1 + +.LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +.LSGEMM_L4_END: +.LSGEMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LSGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble .LSGEMM_L2x16_END + +.LSGEMM_L2x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x16_SUB4 + +.LSGEMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble .LSGEMM_L2x16_LOOP_END + + .align 5 + +.LSGEMM_L2x16_LOOP: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x16_LOOP + +.LSGEMM_L2x16_LOOP_END: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + KERNEL2x16_E2 + + b .LSGEMM_L2x16_SUB1 + +.LSGEMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b .LSGEMM_L2x16_SUB1 + +.LSGEMM_L2x16_SUB0: + + andi. L, K, 7 + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x16_SAVE + b .LSGEMM_L2x16_SUB2 + +.LSGEMM_L2x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x16_SAVE + +.LSGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x16_SUB2 + +.LSGEMM_L2x16_SAVE: + + SAVE2x16 + + addic. I, I, -1 + bgt .LSGEMM_L2x16_BEGIN + +.LSGEMM_L2x16_END: + +.LSGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L2x1_END + + andi. T1, M, 8 + ble .LSGEMM_L2x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x8_SUB4 + +.LSGEMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble .LSGEMM_L2x8_LOOP_END + + .align 5 + +.LSGEMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L2x8_LOOP + +.LSGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LSGEMM_L2x8_SUB1 + +.LSGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LSGEMM_L2x8_SUB1 + +.LSGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x8_SAVE + b .LSGEMM_L2x8_SUB2 + +.LSGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x8_SAVE + +.LSGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x8_SUB2 + +.LSGEMM_L2x8_SAVE: + + SAVE2x8 + +.LSGEMM_L2x8_END: + +.LSGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L2x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x4_SUB4 + +.LSGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LSGEMM_L2x4_LOOP_END + + .align 5 + +.LSGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x4_LOOP + +.LSGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LSGEMM_L2x4_SUB1 + +.LSGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LSGEMM_L2x4_SUB1 + +.LSGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x4_SAVE + b .LSGEMM_L2x4_SUB2 + +.LSGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x4_SAVE + +.LSGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x4_SUB2 + +.LSGEMM_L2x4_SAVE: + + SAVE2x4 + +.LSGEMM_L2x4_END: + +.LSGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x2_SUB4 + +.LSGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LSGEMM_L2x2_LOOP_END + + .align 5 + +.LSGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x2_LOOP + +.LSGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LSGEMM_L2x2_SUB1 + +.LSGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LSGEMM_L2x2_SUB1 + +.LSGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x2_SAVE + b .LSGEMM_L2x2_SUB2 + +.LSGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x2_SAVE + +.LSGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x2_SUB2 + +.LSGEMM_L2x2_SAVE: + + SAVE2x2 + +.LSGEMM_L2x2_END: + +.LSGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x1_SUB4 + +.LSGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -2 + ble .LSGEMM_L2x1_LOOP_END + + .align 5 + +.LSGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x1_LOOP + +.LSGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LSGEMM_L2x1_SUB1 + +.LSGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LSGEMM_L2x1_SUB1 + +.LSGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x1_SAVE + b .LSGEMM_L2x1_SUB2 + +.LSGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x1_SAVE + +.LSGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x1_SUB2 + +.LSGEMM_L2x1_SAVE: + + SAVE2x1 + +.LSGEMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +.LSGEMM_L2_END: +.LSGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LSGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble .LSGEMM_L1x16_END + +.LSGEMM_L1x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x16_SUB4 + +.LSGEMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble .LSGEMM_L1x16_LOOP_END + + .align 5 + +.LSGEMM_L1x16_LOOP: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x16_LOOP + +.LSGEMM_L1x16_LOOP_END: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + KERNEL1x16_E2 + + b .LSGEMM_L1x16_SUB1 + +.LSGEMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b .LSGEMM_L1x16_SUB1 + +.LSGEMM_L1x16_SUB0: + + andi. L, K, 7 + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x16_SAVE + b .LSGEMM_L1x16_SUB2 + +.LSGEMM_L1x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x16_SAVE + +.LSGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x16_SUB2 + +.LSGEMM_L1x16_SAVE: + + SAVE1x16 + + addic. I, I, -1 + bgt .LSGEMM_L1x16_BEGIN + +.LSGEMM_L1x16_END: + +.LSGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L1x1_END + + andi. T1, M, 8 + ble .LSGEMM_L1x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x8_SUB4 + +.LSGEMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble .LSGEMM_L1x8_LOOP_END + + .align 5 + +.LSGEMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L1x8_LOOP + +.LSGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LSGEMM_L1x8_SUB1 + +.LSGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LSGEMM_L1x8_SUB1 + +.LSGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x8_SAVE + b .LSGEMM_L1x8_SUB2 + +.LSGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x8_SAVE + +.LSGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x8_SUB2 + +.LSGEMM_L1x8_SAVE: + + SAVE1x8 + +.LSGEMM_L1x8_END: + +.LSGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x4_SUB4 + +.LSGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LSGEMM_L1x4_LOOP_END + + .align 5 + +.LSGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x4_LOOP + +.LSGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LSGEMM_L1x4_SUB1 + +.LSGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LSGEMM_L1x4_SUB1 + +.LSGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x4_SAVE + b .LSGEMM_L1x4_SUB2 + +.LSGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x4_SAVE + +.LSGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x4_SUB2 + +.LSGEMM_L1x4_SAVE: + + SAVE1x4 + +.LSGEMM_L1x4_END: + +.LSGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x2_SUB4 + +.LSGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LSGEMM_L1x2_LOOP_END + + .align 5 + +.LSGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x2_LOOP + +.LSGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LSGEMM_L1x2_SUB1 + +.LSGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LSGEMM_L1x2_SUB1 + +.LSGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x2_SAVE + b .LSGEMM_L1x2_SUB2 + +.LSGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x2_SAVE + +.LSGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x2_SUB2 + +.LSGEMM_L1x2_SAVE: + + SAVE1x2 + +.LSGEMM_L1x2_END: + +.LSGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x1_SUB4 + +.LSGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. 
L, L, -2 + ble .LSGEMM_L1x1_LOOP_END + + .align 5 + +.LSGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x1_LOOP + +.LSGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LSGEMM_L1x1_SUB1 + +.LSGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LSGEMM_L1x1_SUB1 + +.LSGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x1_SAVE + b .LSGEMM_L1x1_SUB2 + +.LSGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x1_SAVE + +.LSGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x1_SUB2 + +.LSGEMM_L1x1_SAVE: + + SAVE1x1 + +.LSGEMM_L1x1_END: + +.LSGEMM_L1_END: diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S new file mode 100644 index 000000000..78f530cfa --- /dev/null +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -0,0 +1,6145 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + addi AO, AO, 64 + addi BO, BO, 32 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + +.endm + +.macro KERNEL8x16_I1 + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + lxvw4x vs28, o0, BO + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + + lxvw4x vs29, o16, BO + + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + +.endm 
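[Reviewer note, not part of the patch] The LOAD8x16_1 / KERNEL8x16_* macros above keep the 16x8 tile of C in the accumulators vs32-vs63: each k-step loads 16 packed floats of A into vs0-vs3 (or vs4-vs7 for the software-pipelined copy), splats the 8 packed floats of B into vs8-vs15 (or vs16-vs23) with xxspltw, and issues one xvmaddasp per accumulator. A minimal scalar C sketch of a single k-step is given below for orientation only; the function name and the acc[][] layout are illustrative and not part of OpenBLAS.

	/* One k-iteration of the 16x8 micro-kernel, scalar reference.       */
	/* ao: 16 packed floats of A (vs0-vs3); bo: 8 packed floats of B.    */
	/* acc stands in for the 32 VSX accumulators vs32..vs63.             */
	static void sgemm_16x8_kstep(const float *ao, const float *bo,
	                             float acc[8][16])
	{
	    for (int j = 0; j < 8; j++)          /* xxspltw of vs28/vs29     */
	        for (int i = 0; i < 16; i++)     /* four 4-float VSX vectors */
	            acc[j][i] += ao[i] * bo[j];  /* xvmaddasp                */
	}

The assembly unrolls this by two in k: KERNEL8x16_1 multiplies vs0-vs3 by vs8-vs15 while loading the next step's vs4-vs7 and splatting vs16-vs23, and KERNEL8x16_2 does the reverse. SAVE8x16 later folds in alpha via alpha_vr when the tile is written back to C.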
+ +.macro KERNEL8x16_2 + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + + lxvw4x vs28, o0, BO + + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + + lxvw4x vs29, o16, BO + + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro 
KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, 
o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr + +#else + + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr + +#else + + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr + +#else + + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr + +#else + + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + 
xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp 
vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + 
xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + 
+ xvmulsp vs0, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + xsmulsp vs40, vs0, vs12 + xsmulsp vs41, vs1, vs12 + + xsmulsp vs42, vs0, vs13 + xsmulsp vs43, vs1, vs13 + + xsmulsp vs44, vs0, vs14 + xsmulsp vs45, vs1, vs14 + + xsmulsp vs46, vs0, vs15 + xsmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + xsmaddasp vs40, vs0, vs12 + xsmaddasp vs41, vs1, vs12 + + xsmaddasp vs42, vs0, vs13 + xsmaddasp vs43, vs1, vs13 + + xsmaddasp vs44, vs0, vs14 + xsmaddasp vs45, vs1, vs14 + + xsmaddasp vs46, vs0, vs15 + xsmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, 
vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + xsmaddasp vs40, vs4, vs20 + xsmaddasp vs41, vs5, vs20 + + xsmaddasp vs42, vs4, vs21 + xsmaddasp vs43, vs5, vs21 + + xsmaddasp vs44, vs4, vs22 + xsmaddasp vs45, vs5, vs22 + + xsmaddasp vs46, vs4, vs23 + xsmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + xsmaddasp vs40, vs4, vs20 + xsmaddasp vs41, vs5, vs20 + + xsmaddasp vs42, vs4, vs21 + xsmaddasp vs43, vs5, vs21 + + xsmaddasp vs44, vs4, vs22 + xsmaddasp vs45, vs5, vs22 + + xsmaddasp vs46, vs4, vs23 + xsmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + xsmulsp vs40, vs0, vs12 + xsmulsp vs41, vs1, vs12 + + xsmulsp vs42, vs0, vs13 + xsmulsp vs43, vs1, vs13 + + xsmulsp vs44, vs0, vs14 + xsmulsp vs45, vs1, vs14 + + xsmulsp vs46, vs0, vs15 + xsmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + xsmaddasp vs40, vs0, vs12 + xsmaddasp vs41, vs1, vs12 + + xsmaddasp vs42, vs0, vs13 + xsmaddasp vs43, vs1, vs13 + + xsmaddasp vs44, vs0, vs14 + xsmaddasp vs45, vs1, vs14 + + xsmaddasp vs46, vs0, vs15 + xsmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + xsmulsp vs1, vs37, alpha_r + +#else + + xsmaddasp vs0, vs36, alpha_r + xsmaddasp vs1, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + xsmulsp vs1, vs39, alpha_r + +#else + + xsmaddasp vs0, vs38, 
alpha_r + xsmaddasp vs1, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs40, alpha_r + xsmulsp vs1, vs41, alpha_r + +#else + + xsmaddasp vs0, vs40, alpha_r + xsmaddasp vs1, vs41, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs42, alpha_r + xsmulsp vs1, vs43, alpha_r + +#else + + xsmaddasp vs0, vs42, alpha_r + xsmaddasp vs1, vs43, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs44, alpha_r + xsmulsp vs1, vs45, alpha_r + +#else + + xsmaddasp vs0, vs44, alpha_r + xsmaddasp vs1, vs45, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs46, alpha_r + xsmulsp vs1, vs47, alpha_r + +#else + + xsmaddasp vs0, vs46, alpha_r + xsmaddasp vs1, vs47, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + xsmulsp vs36, vs0, vs12 + + xsmulsp vs37, vs0, vs13 + + xsmulsp vs38, vs0, vs14 + + xsmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + xsmaddasp vs36, vs0, vs12 + + xsmaddasp vs37, vs0, vs13 + + xsmaddasp vs38, vs0, vs14 + + xsmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + xsmaddasp vs36, vs4, vs20 + + xsmaddasp vs37, vs4, vs21 + + xsmaddasp vs38, vs4, vs22 + + xsmaddasp vs39, vs4, vs23 + + +.endm + 
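+/* Descriptive note on the macro that follows (wording added for clarity; behavior read
+ * directly from the code): KERNEL8x1_E2 is the epilogue step of the software-pipelined
+ * K loop for the N=8, M=1 tile. It consumes the A element (vs4) and the eight B elements
+ * (vs16-vs23) prefetched by the preceding KERNEL8x1_1/KERNEL8x1_2 step and accumulates
+ * into vs32-vs39 without issuing any further loads, so the loop can end without reading
+ * past the packed A/B buffers. */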
+.macro KERNEL8x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + xsmaddasp vs36, vs4, vs20 + + xsmaddasp vs37, vs4, vs21 + + xsmaddasp vs38, vs4, vs22 + + xsmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + xsmulsp vs36, vs0, vs12 + + xsmulsp vs37, vs0, vs13 + + xsmulsp vs38, vs0, vs14 + + xsmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + xsmaddasp vs36, vs0, vs12 + + xsmaddasp vs37, vs0, vs13 + + xsmaddasp vs38, vs0, vs14 + + xsmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs35, alpha_r + +#else + + xsmaddasp vs0, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + +#else + + xsmaddasp vs0, vs36, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs37, alpha_r + +#else + + xsmaddasp vs0, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + +#else + + xsmaddasp vs0, vs38, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs39, alpha_r + +#else + + xsmaddasp vs0, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, 
BO, 16 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x 
vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, 
vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC 
+ + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, 
o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + xsmulsp vs1, vs37, alpha_r + +#else + + xsmaddasp vs0, vs36, alpha_r + xsmaddasp vs1, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef 
TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + xsmulsp vs1, vs39, alpha_r + +#else + + xsmaddasp vs0, vs38, alpha_r + xsmaddasp vs1, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs35, alpha_r + +#else + + xsmaddasp vs0, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 
+**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + 
+#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw 
vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + 
mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro LOAD1x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + 
xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro SAVE1x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, 
o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + 
+ mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S new file mode 100644 index 000000000..5b1c5ca6b --- /dev/null +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -0,0 +1,364 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define o12 r14 +#define o4 r15 +#define K1 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define KKK 21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "sgemm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + mr KK, OFFSET +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, KK +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + + addi T1, SP, 300 + stfs f1, 0(T1) + stfs f1, 4(T1) + stfs f1, 8(T1) + stfs f1,12(T1) + + lxsspx vs28, 0, T1 + + xxspltw alpha_r, vs28 , 0 + lxvw4x alpha_vr, 0, T1 + + + +#include "strmm_logic_16x8_power8.S" + 
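+// A rough sketch of the bookkeeping done by the included logic file for the
+// 16x8 tile (kk/ao/bo are illustrative names mirroring the KK/AO/BO
+// registers): each k-step consumes 16 packed A values and 8 packed B values,
+// so with float pointers the panel offsets are, approximately,
+//
+//     ao = a + (size_t)kk * 16;   // same byte offset as "slwi T2, KK, 6"
+//     bo = b + (size_t)kk * 8;    // same byte offset as "slwi T1, KK, 5"
+//
+// The four stfs of f1 above replicate alpha into a 16-byte stack slot so it
+// can be picked up both as the vector alpha_vr (lxvw4x) and as the splatted
+// scalar alpha_r (lxsspx/xxspltw) used by the narrow M=2 and M=1 tail macros.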
+.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S new file mode 100644 index 000000000..0d6d04858 --- /dev/null +++ b/kernel/power/strmm_logic_16x8_power8.S @@ -0,0 +1,2969 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + + + + srawi. J, N, 3 + ble .LSTRMM_L8_END + +.LSTRMM_L8_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L8x16_END + +.LSTRMM_L8x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x16_SUB4 + +.LSTRMM_L8x16_LOOP_START: + + dcbt AO, PRE + LOAD8x16_1 + dcbt AO, PRE + KERNEL8x16_I1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble .LSTRMM_L8x16_LOOP_END + + .align 5 + +.LSTRMM_L8x16_LOOP: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x16_LOOP + +.LSTRMM_L8x16_LOOP_END: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + KERNEL8x16_E2 + + b .LSTRMM_L8x16_SUB1 + +.LSTRMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b .LSTRMM_L8x16_SUB1 + +.LSTRMM_L8x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x16_SAVE + b .LSTRMM_L8x16_SUB2 + +.LSTRMM_L8x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x16_SAVE + +.LSTRMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x16_SUB2 + +.LSTRMM_L8x16_SAVE: + + SAVE8x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L8x16_BEGIN + +.LSTRMM_L8x16_END: + +.LSTRMM_L8x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L8x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L8x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x8_SUB4 + +.LSTRMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble .LSTRMM_L8x8_LOOP_END + + .align 5 + +.LSTRMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x8_LOOP + +.LSTRMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b .LSTRMM_L8x8_SUB1 + +.LSTRMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b .LSTRMM_L8x8_SUB1 + +.LSTRMM_L8x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x8_SAVE + b .LSTRMM_L8x8_SUB2 + +.LSTRMM_L8x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x8_SAVE + +.LSTRMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x8_SUB2 + +.LSTRMM_L8x8_SAVE: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L8x8_END: + +.LSTRMM_L8x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L8x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x4_SUB4 + +.LSTRMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble .LSTRMM_L8x4_LOOP_END + + .align 5 + +.LSTRMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L8x4_LOOP + +.LSTRMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b .LSTRMM_L8x4_SUB1 + +.LSTRMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b .LSTRMM_L8x4_SUB1 + +.LSTRMM_L8x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x4_SAVE + b .LSTRMM_L8x4_SUB2 + +.LSTRMM_L8x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x4_SAVE + +.LSTRMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x4_SUB2 + +.LSTRMM_L8x4_SAVE: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L8x4_END: + +.LSTRMM_L8x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L8x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x2_SUB4 + +.LSTRMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble .LSTRMM_L8x2_LOOP_END + + .align 5 + +.LSTRMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x2_LOOP + +.LSTRMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b .LSTRMM_L8x2_SUB1 + +.LSTRMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b .LSTRMM_L8x2_SUB1 + +.LSTRMM_L8x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x2_SAVE + b .LSTRMM_L8x2_SUB2 + +.LSTRMM_L8x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x2_SAVE + +.LSTRMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L8x2_SUB2 + +.LSTRMM_L8x2_SAVE: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L8x2_END: + +.LSTRMM_L8x1_BEGIN: + + andi. T1, M, 1 + ble .LSTRMM_L8x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x1_SUB4 + +.LSTRMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble .LSTRMM_L8x1_LOOP_END + + .align 5 + +.LSTRMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x1_LOOP + +.LSTRMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b .LSTRMM_L8x1_SUB1 + +.LSTRMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b .LSTRMM_L8x1_SUB1 + +.LSTRMM_L8x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x1_SAVE + b .LSTRMM_L8x1_SUB2 + +.LSTRMM_L8x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x1_SAVE + +.LSTRMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x1_SUB2 + +.LSTRMM_L8x1_SAVE: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 8 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt .LSTRMM_L8_BEGIN + + andi. T2, N, 7 + ble .L999 + +.LSTRMM_L8_END: + + b .LSTRMM_L4_BEGIN + +.L999_H1: + + b .L999 + +.LSTRMM_L4_BEGIN: + + andi. T1, N, 4 + ble .LSTRMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L4x16_END + +.LSTRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x16_SUB4 + +.LSTRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble .LSTRMM_L4x16_LOOP_END + + .align 5 + +.LSTRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x16_LOOP + +.LSTRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b .LSTRMM_L4x16_SUB1 + +.LSTRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b .LSTRMM_L4x16_SUB1 + +.LSTRMM_L4x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x16_SAVE + b .LSTRMM_L4x16_SUB2 + +.LSTRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x16_SAVE + +.LSTRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x16_SUB2 + +.LSTRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L4x16_BEGIN + +.LSTRMM_L4x16_END: + +.LSTRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L4x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x8_SUB4 + +.LSTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble .LSTRMM_L4x8_LOOP_END + + .align 5 + +.LSTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x8_LOOP + +.LSTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LSTRMM_L4x8_SUB1 + +.LSTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LSTRMM_L4x8_SUB1 + +.LSTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x8_SAVE + b .LSTRMM_L4x8_SUB2 + +.LSTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x8_SAVE + +.LSTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x8_SUB2 + +.LSTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L4x8_END: + +.LSTRMM_L4x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x4_SUB4 + +.LSTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LSTRMM_L4x4_LOOP_END + + .align 5 + +.LSTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L4x4_LOOP + +.LSTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LSTRMM_L4x4_SUB1 + +.LSTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LSTRMM_L4x4_SUB1 + +.LSTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x4_SAVE + b .LSTRMM_L4x4_SUB2 + +.LSTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x4_SAVE + +.LSTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x4_SUB2 + +.LSTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L4x4_END: + +.LSTRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x2_SUB4 + +.LSTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LSTRMM_L4x2_LOOP_END + + .align 5 + +.LSTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x2_LOOP + +.LSTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LSTRMM_L4x2_SUB1 + +.LSTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LSTRMM_L4x2_SUB1 + +.LSTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x2_SAVE + b .LSTRMM_L4x2_SUB2 + +.LSTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x2_SAVE + +.LSTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L4x2_SUB2 + +.LSTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L4x2_END: + +.LSTRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble .LSTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x1_SUB4 + +.LSTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LSTRMM_L4x1_LOOP_END + + .align 5 + +.LSTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x1_LOOP + +.LSTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LSTRMM_L4x1_SUB1 + +.LSTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LSTRMM_L4x1_SUB1 + +.LSTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x1_SAVE + b .LSTRMM_L4x1_SUB2 + +.LSTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x1_SAVE + +.LSTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x1_SUB2 + +.LSTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + +.LSTRMM_L4_END: +.LSTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LSTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L2x16_END + +.LSTRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x16_SUB4 + +.LSTRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble .LSTRMM_L2x16_LOOP_END + + .align 5 + +.LSTRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x16_LOOP + +.LSTRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b .LSTRMM_L2x16_SUB1 + +.LSTRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b .LSTRMM_L2x16_SUB1 + +.LSTRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x16_SAVE + b .LSTRMM_L2x16_SUB2 + +.LSTRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x16_SAVE + +.LSTRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x16_SUB2 + +.LSTRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L2x16_BEGIN + +.LSTRMM_L2x16_END: + +.LSTRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L2x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x8_SUB4 + +.LSTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble .LSTRMM_L2x8_LOOP_END + + .align 5 + +.LSTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x8_LOOP + +.LSTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LSTRMM_L2x8_SUB1 + +.LSTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LSTRMM_L2x8_SUB1 + +.LSTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x8_SAVE + b .LSTRMM_L2x8_SUB2 + +.LSTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x8_SAVE + +.LSTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x8_SUB2 + +.LSTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L2x8_END: + +.LSTRMM_L2x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x4_SUB4 + +.LSTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LSTRMM_L2x4_LOOP_END + + .align 5 + +.LSTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L2x4_LOOP + +.LSTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LSTRMM_L2x4_SUB1 + +.LSTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LSTRMM_L2x4_SUB1 + +.LSTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x4_SAVE + b .LSTRMM_L2x4_SUB2 + +.LSTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x4_SAVE + +.LSTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x4_SUB2 + +.LSTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L2x4_END: + +.LSTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x2_SUB4 + +.LSTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LSTRMM_L2x2_LOOP_END + + .align 5 + +.LSTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x2_LOOP + +.LSTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LSTRMM_L2x2_SUB1 + +.LSTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LSTRMM_L2x2_SUB1 + +.LSTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x2_SAVE + b .LSTRMM_L2x2_SUB2 + +.LSTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x2_SAVE + +.LSTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L2x2_SUB2 + +.LSTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L2x2_END: + +.LSTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble .LSTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x1_SUB4 + +.LSTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LSTRMM_L2x1_LOOP_END + + .align 5 + +.LSTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x1_LOOP + +.LSTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LSTRMM_L2x1_SUB1 + +.LSTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LSTRMM_L2x1_SUB1 + +.LSTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x1_SAVE + b .LSTRMM_L2x1_SUB2 + +.LSTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x1_SAVE + +.LSTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x1_SUB2 + +.LSTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +.LSTRMM_L2_END: +.LSTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LSTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L1x16_END + +.LSTRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x16_SUB4 + +.LSTRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble .LSTRMM_L1x16_LOOP_END + + .align 5 + +.LSTRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x16_LOOP + +.LSTRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b .LSTRMM_L1x16_SUB1 + +.LSTRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b .LSTRMM_L1x16_SUB1 + +.LSTRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x16_SAVE + b .LSTRMM_L1x16_SUB2 + +.LSTRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x16_SAVE + +.LSTRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L1x16_SUB2 + +.LSTRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L1x16_BEGIN + +.LSTRMM_L1x16_END: + +.LSTRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L1x1_END + + andi. 
+.LSTRMM_L1x8_BEGIN:
+	andi.	T2, M, 15
+	ble	.LSTRMM_L1x1_END
+
+	andi.	T1, M, 8
+	ble	.LSTRMM_L1x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	mr	BO, B		// B -> BO
+#else
+	mr	BO, B		// B -> BO
+	slwi	T1, KK, 2	// Number of values in B shifted
+	slwi	T2, KK, 5	// Number of values in A shifted
+	add	BO, BO, T1	// Add values to BO
+	add	AO, AO, T2	// Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	sub	T1, K, KK	// K - KK -> TEMP1
+#else
+	mr	T1, KK		// KK -> KTEMP
+#ifdef LEFT
+	addi	T1, T1, 8	// KTEMP + Number of values in A -> KTEMP
+#else
+	addi	T1, T1, 1	// KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+	mr	KKK, T1
+	mr	K1, T1
+	srawi.	L, K1, 3	// KTEMP / 8 -> L
+	ble	.LSTRMM_L1x8_SUB0
+	cmpwi	cr0, L, 1
+	ble	.LSTRMM_L1x8_SUB4
+
+.LSTRMM_L1x8_LOOP_START:
+
+	LOAD1x8_1
+	KERNEL1x8_I1
+	KERNEL1x8_2
+	KERNEL1x8_1
+	KERNEL1x8_2
+
+	KERNEL1x8_1
+	KERNEL1x8_2
+	KERNEL1x8_1
+	KERNEL1x8_2
+
+	addic.	L, L, -2
+	ble	.LSTRMM_L1x8_LOOP_END
+
+	.align 5
+
+.LSTRMM_L1x8_LOOP:
+
+	KERNEL1x8_1
+	KERNEL1x8_2
+	KERNEL1x8_1
+	KERNEL1x8_2
+
+	KERNEL1x8_1
+	KERNEL1x8_2
+	KERNEL1x8_1
+	KERNEL1x8_2
+
+	addic.	L, L, -1
+	bgt	.LSTRMM_L1x8_LOOP
+
+.LSTRMM_L1x8_LOOP_END:
+
+	KERNEL1x8_1
+	KERNEL1x8_2
+	KERNEL1x8_1
+	KERNEL1x8_2
+
+	KERNEL1x8_1
+	KERNEL1x8_2
+	KERNEL1x8_1
+	KERNEL1x8_E2
+
+	b	.LSTRMM_L1x8_SUB1
+
+.LSTRMM_L1x8_SUB4:
+
+	KERNEL1x8_SUBI1
+	KERNEL1x8_SUB1
+	KERNEL1x8_SUB1
+	KERNEL1x8_SUB1
+
+	KERNEL1x8_SUB1
+	KERNEL1x8_SUB1
+	KERNEL1x8_SUB1
+	KERNEL1x8_SUB1
+
+	b	.LSTRMM_L1x8_SUB1
+
+.LSTRMM_L1x8_SUB0:
+
+	andi.	L, K1, 7	// K1 & 7 -> L
+
+	KERNEL1x8_SUBI1
+
+	addic.	L, L, -1
+	ble	.LSTRMM_L1x8_SAVE
+	b	.LSTRMM_L1x8_SUB2
+
+.LSTRMM_L1x8_SUB1:
+
+	andi.	L, K1, 7	// K1 & 7 -> L
+	ble	.LSTRMM_L1x8_SAVE
+
+.LSTRMM_L1x8_SUB2:
+
+	KERNEL1x8_SUB1
+
+	addic.	L, L, -1
+	bgt	.LSTRMM_L1x8_SUB2
+
+.LSTRMM_L1x8_SAVE:
+
+	SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK	// K - KKK -> TEMP1
+	slwi	T2, T1, 2	// TEMP1 * Number of values in B shifted -> TEMP2
+	slwi	T1, T1, 5	// TEMP1 * Number of values in A shifted -> TEMP1
+	add	BO, BO, T2	// BO += TEMP2 * number of values in B shifted
+	add	AO, AO, T1	// AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+	addi	KK, KK, 8	// KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x8_END:
+
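+// The 4-, 2- and 1-row tails below follow the same skeleton; only the A-side
+// shift (KK * rows * sizeof(float)) and the KK increment change with the
+// strip width, while the B-side shift stays at 2 (one float per column).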
+.LSTRMM_L1x4_BEGIN:
+
+	andi.	T1, M, 4
+	ble	.LSTRMM_L1x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	mr	BO, B		// B -> BO
+#else
+	mr	BO, B		// B -> BO
+	slwi	T1, KK, 2	// Number of values in B shifted
+	slwi	T2, KK, 4	// Number of values in A shifted
+	add	BO, BO, T1	// Add values to BO
+	add	AO, AO, T2	// Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	sub	T1, K, KK	// K - KK -> TEMP1
+#else
+	mr	T1, KK		// KK -> KTEMP
+#ifdef LEFT
+	addi	T1, T1, 4	// KTEMP + Number of values in A -> KTEMP
+#else
+	addi	T1, T1, 1	// KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+	mr	KKK, T1
+	mr	K1, T1
+	srawi.	L, K1, 3	// KTEMP / 8 -> L
+	ble	.LSTRMM_L1x4_SUB0
+	cmpwi	cr0, L, 1
+	ble	.LSTRMM_L1x4_SUB4
+
+.LSTRMM_L1x4_LOOP_START:
+
+	LOAD1x4_1
+	KERNEL1x4_I1
+	KERNEL1x4_2
+	KERNEL1x4_1
+	KERNEL1x4_2
+
+	KERNEL1x4_1
+	KERNEL1x4_2
+	KERNEL1x4_1
+	KERNEL1x4_2
+
+	addic.	L, L, -2
+	ble	.LSTRMM_L1x4_LOOP_END
+
+	.align 5
+
+.LSTRMM_L1x4_LOOP:
+
+	KERNEL1x4_1
+	KERNEL1x4_2
+	KERNEL1x4_1
+	KERNEL1x4_2
+
+	KERNEL1x4_1
+	KERNEL1x4_2
+	KERNEL1x4_1
+	KERNEL1x4_2
+
+	addic.	L, L, -1
+	bgt	.LSTRMM_L1x4_LOOP
+
+.LSTRMM_L1x4_LOOP_END:
+
+	KERNEL1x4_1
+	KERNEL1x4_2
+	KERNEL1x4_1
+	KERNEL1x4_2
+
+	KERNEL1x4_1
+	KERNEL1x4_2
+	KERNEL1x4_1
+	KERNEL1x4_E2
+
+	b	.LSTRMM_L1x4_SUB1
+
+.LSTRMM_L1x4_SUB4:
+
+	KERNEL1x4_SUBI1
+	KERNEL1x4_SUB1
+	KERNEL1x4_SUB1
+	KERNEL1x4_SUB1
+
+	KERNEL1x4_SUB1
+	KERNEL1x4_SUB1
+	KERNEL1x4_SUB1
+	KERNEL1x4_SUB1
+
+	b	.LSTRMM_L1x4_SUB1
+
+.LSTRMM_L1x4_SUB0:
+
+	andi.	L, K1, 7	// K1 & 7 -> L
+
+	KERNEL1x4_SUBI1
+
+	addic.	L, L, -1
+	ble	.LSTRMM_L1x4_SAVE
+	b	.LSTRMM_L1x4_SUB2
+
+.LSTRMM_L1x4_SUB1:
+
+	andi.	L, K1, 7	// K1 & 7 -> L
+	ble	.LSTRMM_L1x4_SAVE
+
+.LSTRMM_L1x4_SUB2:
+
+	KERNEL1x4_SUB1
+
+	addic.	L, L, -1
+	bgt	.LSTRMM_L1x4_SUB2
+
+.LSTRMM_L1x4_SAVE:
+
+	SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK	// K - KKK -> TEMP1
+	slwi	T2, T1, 2	// TEMP1 * Number of values in B shifted -> TEMP2
+	slwi	T1, T1, 4	// TEMP1 * Number of values in A shifted -> TEMP1
+	add	BO, BO, T2	// BO += TEMP2 * number of values in B shifted
+	add	AO, AO, T1	// AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+	addi	KK, KK, 4	// KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x4_END:
+
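+// Note: within this N=1 pass only the 16-row strip issues dcbt prefetch
+// hints; the 8/4/2/1-row tails run without them, presumably because they
+// touch far less data per iteration.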
+.LSTRMM_L1x2_BEGIN:
+
+	andi.	T1, M, 2
+	ble	.LSTRMM_L1x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	mr	BO, B		// B -> BO
+#else
+	mr	BO, B		// B -> BO
+	slwi	T1, KK, 2	// Number of values in B shifted
+	slwi	T2, KK, 3	// Number of values in A shifted
+	add	BO, BO, T1	// Add values to BO
+	add	AO, AO, T2	// Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	sub	T1, K, KK	// K - KK -> TEMP1
+#else
+	mr	T1, KK		// KK -> KTEMP
+#ifdef LEFT
+	addi	T1, T1, 2	// KTEMP + Number of values in A -> KTEMP
+#else
+	addi	T1, T1, 1	// KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+	mr	KKK, T1
+	mr	K1, T1
+	srawi.	L, K1, 3	// KTEMP / 8 -> L
+	ble	.LSTRMM_L1x2_SUB0
+	cmpwi	cr0, L, 1
+	ble	.LSTRMM_L1x2_SUB4
+
+.LSTRMM_L1x2_LOOP_START:
+
+	LOAD1x2_1
+	KERNEL1x2_I1
+	KERNEL1x2_2
+	KERNEL1x2_1
+	KERNEL1x2_2
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_1
+	KERNEL1x2_2
+
+	addic.	L, L, -2
+	ble	.LSTRMM_L1x2_LOOP_END
+
+	.align 5
+
+.LSTRMM_L1x2_LOOP:
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_1
+	KERNEL1x2_2
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_1
+	KERNEL1x2_2
+
+	addic.	L, L, -1
+	bgt	.LSTRMM_L1x2_LOOP
+
+.LSTRMM_L1x2_LOOP_END:
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_1
+	KERNEL1x2_2
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_1
+	KERNEL1x2_E2
+
+	b	.LSTRMM_L1x2_SUB1
+
+.LSTRMM_L1x2_SUB4:
+
+	KERNEL1x2_SUBI1
+	KERNEL1x2_SUB1
+	KERNEL1x2_SUB1
+	KERNEL1x2_SUB1
+
+	KERNEL1x2_SUB1
+	KERNEL1x2_SUB1
+	KERNEL1x2_SUB1
+	KERNEL1x2_SUB1
+
+	b	.LSTRMM_L1x2_SUB1
+
+.LSTRMM_L1x2_SUB0:
+
+	andi.	L, K1, 7	// K1 & 7 -> L
+
+	KERNEL1x2_SUBI1
+
+	addic.	L, L, -1
+	ble	.LSTRMM_L1x2_SAVE
+	b	.LSTRMM_L1x2_SUB2
+
+.LSTRMM_L1x2_SUB1:
+
+	andi.	L, K1, 7	// K1 & 7 -> L
+	ble	.LSTRMM_L1x2_SAVE
+
+.LSTRMM_L1x2_SUB2:
+
+	KERNEL1x2_SUB1
+
+	addic.	L, L, -1
+	bgt	.LSTRMM_L1x2_SUB2
+
+.LSTRMM_L1x2_SAVE:
+
+	SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK	// K - KKK -> TEMP1
+	slwi	T2, T1, 2	// TEMP1 * Number of values in B shifted -> TEMP2
+	slwi	T1, T1, 3	// TEMP1 * Number of values in A shifted -> TEMP1
+	add	BO, BO, T2	// BO += TEMP2 * number of values in B shifted
+	add	AO, AO, T1	// AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+	addi	KK, KK, 2	// KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x2_END:
+
+.LSTRMM_L1x1_BEGIN:
+
+	andi.	T1, M, 1
+	ble	.LSTRMM_L1x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	mr	BO, B		// B -> BO
+#else
+	mr	BO, B		// B -> BO
+	slwi	T1, KK, 2	// Number of values in B shifted
+	slwi	T2, KK, 2	// Number of values in A shifted
+	add	BO, BO, T1	// Add values to BO
+	add	AO, AO, T2	// Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	sub	T1, K, KK	// K - KK -> TEMP1
+#else
+	mr	T1, KK		// KK -> KTEMP
+#ifdef LEFT
+	addi	T1, T1, 1	// KTEMP + Number of values in A -> KTEMP
+#else
+	addi	T1, T1, 1	// KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+	mr	KKK, T1
+	mr	K1, T1
+	srawi.	L, K1, 3	// KTEMP / 8 -> L
+	ble	.LSTRMM_L1x1_SUB0
+	cmpwi	cr0, L, 1
+	ble	.LSTRMM_L1x1_SUB4
+
+.LSTRMM_L1x1_LOOP_START:
+
+	LOAD1x1_1
+	KERNEL1x1_I1
+	KERNEL1x1_2
+	KERNEL1x1_1
+	KERNEL1x1_2
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_1
+	KERNEL1x1_2
+
+	addic.	L, L, -2
+	ble	.LSTRMM_L1x1_LOOP_END
+
+	.align 5
+
+.LSTRMM_L1x1_LOOP:
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_1
+	KERNEL1x1_2
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_1
+	KERNEL1x1_2
+
+	addic.	L, L, -1
+	bgt	.LSTRMM_L1x1_LOOP
+
+.LSTRMM_L1x1_LOOP_END:
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_1
+	KERNEL1x1_2
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_1
+	KERNEL1x1_E2
+
+	b	.LSTRMM_L1x1_SUB1
+
+.LSTRMM_L1x1_SUB4:
+
+	KERNEL1x1_SUBI1
+	KERNEL1x1_SUB1
+	KERNEL1x1_SUB1
+	KERNEL1x1_SUB1
+
+	KERNEL1x1_SUB1
+	KERNEL1x1_SUB1
+	KERNEL1x1_SUB1
+	KERNEL1x1_SUB1
+
+	b	.LSTRMM_L1x1_SUB1
+
+.LSTRMM_L1x1_SUB0:
+
+	andi.	L, K1, 7	// K1 & 7 -> L
+
+	KERNEL1x1_SUBI1
+
+	addic.	L, L, -1
+	ble	.LSTRMM_L1x1_SAVE
+	b	.LSTRMM_L1x1_SUB2
+
+.LSTRMM_L1x1_SUB1:
+
+	andi.	L, K1, 7	// K1 & 7 -> L
+	ble	.LSTRMM_L1x1_SAVE
+
+.LSTRMM_L1x1_SUB2:
+
+	KERNEL1x1_SUB1
+
+	addic.	L, L, -1
+	bgt	.LSTRMM_L1x1_SUB2
+
+.LSTRMM_L1x1_SAVE:
+
+	SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK	// K - KKK -> TEMP1
+	slwi	T2, T1, 2	// TEMP1 * Number of values in B shifted -> TEMP2
+	slwi	T1, T1, 2	// TEMP1 * Number of values in A shifted -> TEMP1
+	add	BO, BO, T2	// BO += TEMP2 * number of values in B shifted
+	add	AO, AO, T1	// AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+	addi	KK, KK, 1	// KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x1_END:
+
+#if !defined(LEFT)
+	addi	KK, KK, 1	// KK += Number of values in B
+#endif
+
+
+.LSTRMM_L1_END:
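A note on the KK/KKK bookkeeping that every tile block above repeats: depending on the LEFT/TRANSA case, the kernel either offsets AO/BO into the packed panels by KK elements on entry, or advances them by K - KKK elements after the SAVE macro. The C sketch below only models that pointer arithmetic in float elements (the assembly works in bytes, so the slwi shift count is log2(tile_width * sizeof(float))); the names panel_start and panel_advance are illustrative assumptions, not code from this patch.

#include <stddef.h>

/* Illustrative model of the "slwi .../ add BO, AO" bookkeeping for one
 * m_tile x n_tile block of the packed STRMM panels.                      */
static void panel_start(const float **ao, const float **bo,
                        const float *a, const float *b,
                        int kk, int m_tile, int n_tile)
{
    *ao = a + (size_t)kk * m_tile;   /* AO = A + KK * m_tile elements */
    *bo = b + (size_t)kk * n_tile;   /* BO = B + KK * n_tile elements */
}

static void panel_advance(const float **ao, const float **bo,
                          int k, int kkk, int m_tile, int n_tile)
{
    int t = k - kkk;                 /* sub  T1, K, KKK               */
    *ao += (size_t)t * m_tile;       /* AO += TEMP1 * m_tile elements */
    *bo += (size_t)t * n_tile;       /* BO += TEMP1 * n_tile elements */
}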
diff --git a/param.h b/param.h
index 31125d8e4..f5d1ab2ea 100644
--- a/param.h
+++ b/param.h
@@ -1961,15 +1961,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(POWER8)
 
 
-#define SNUMOPT 4
+#define SNUMOPT 16
 #define DNUMOPT 8
 
 #define GEMM_DEFAULT_OFFSET_A 384
 #define GEMM_DEFAULT_OFFSET_B 1024
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define SGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
 #define DGEMM_DEFAULT_UNROLL_M 16
 #define DGEMM_DEFAULT_UNROLL_N 4
 #define CGEMM_DEFAULT_UNROLL_M 2
@@ -1977,12 +1977,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P 992
+#define SGEMM_DEFAULT_P 960
 #define DGEMM_DEFAULT_P 480
 #define CGEMM_DEFAULT_P 488
 #define ZGEMM_DEFAULT_P 240
 
-#define SGEMM_DEFAULT_Q 504
+#define SGEMM_DEFAULT_Q 720
 #define DGEMM_DEFAULT_Q 720
 #define CGEMM_DEFAULT_Q 400
 #define ZGEMM_DEFAULT_Q 360
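For reference, the new single-precision constants fit the usual OpenBLAS blocking scheme: the kernel computes a 16 x 8 tile of C per call (SGEMM_DEFAULT_UNROLL_M/N), while SGEMM_DEFAULT_P and SGEMM_DEFAULT_Q appear to set the M- and K-dimension panel sizes that get packed and kept cache-resident. The loop nest below is only a schematic sketch of which loop each constant drives; the names and structure are assumptions, not the actual OpenBLAS driver code.

enum {
    UNROLL_M = 16,  /* SGEMM_DEFAULT_UNROLL_M: C rows per kernel call        */
    UNROLL_N = 8,   /* SGEMM_DEFAULT_UNROLL_N: C columns per kernel call     */
    BLOCK_P  = 960, /* SGEMM_DEFAULT_P: rows of A packed per outer step      */
    BLOCK_Q  = 720  /* SGEMM_DEFAULT_Q: depth (K) packed per outer step      */
};

/* Schematic only: shows how the constants above partition an M x N x K GEMM. */
static void sgemm_blocking_sketch(int m, int n, int k)
{
    for (int ks = 0; ks < k; ks += BLOCK_Q)          /* pack a BLOCK_Q-deep B panel  */
        for (int ms = 0; ms < m; ms += BLOCK_P)      /* pack a BLOCK_P x BLOCK_Q A panel */
            for (int j = 0; j < n; j += UNROLL_N)    /* 8 columns of C at a time     */
                for (int i = ms; i < ms + BLOCK_P && i < m; i += UNROLL_M)
                    /* sgemm_kernel_16x8_power8 would compute this 16 x 8 tile */ ;
}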