From 96284ab295d26e00ea36e21ed31e26b39b9601d0 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 14 Mar 2016 13:52:44 +0100 Subject: [PATCH 1/4] added sgemm- and strmm-kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 14 +- kernel/power/sgemm_kernel_16x8_power8.S | 354 ++ kernel/power/sgemm_logic_16x8_power8.S | 2172 ++++++++ kernel/power/sgemm_macros_16x8_power8.S | 6145 +++++++++++++++++++++++ kernel/power/strmm_kernel_16x8_power8.S | 364 ++ kernel/power/strmm_logic_16x8_power8.S | 2969 +++++++++++ param.h | 10 +- 7 files changed, 12018 insertions(+), 10 deletions(-) create mode 100644 kernel/power/sgemm_kernel_16x8_power8.S create mode 100644 kernel/power/sgemm_logic_16x8_power8.S create mode 100644 kernel/power/sgemm_macros_16x8_power8.S create mode 100644 kernel/power/strmm_kernel_16x8_power8.S create mode 100644 kernel/power/strmm_logic_16x8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 760d568cd..d40b20dd8 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -3,14 +3,18 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = gemm_kernel_power6.S +STRMMKERNEL = strmm_kernel_16x8_power8.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = gemm_kernel_power6.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o @@ -146,7 +150,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c -#ZGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S new file mode 100644 index 000000000..9f221301a --- /dev/null +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -0,0 +1,354 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define o4 r15 +#define o12 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "sgemm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, 2 + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + li PRE, 384 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + + addi T1, SP, 300 + stfs f1, 0(T1) + stfs f1, 4(T1) + stfs f1, 8(T1) + stfs f1,12(T1) + + lxsspx vs28, 0, T1 + + xxspltw alpha_r, vs28 , 0 + lxvw4x alpha_vr, 0, T1 + + + +#include "sgemm_logic_16x8_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) 
+ lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S new file mode 100644 index 000000000..6c5a1c7ef --- /dev/null +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -0,0 +1,2172 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + + + srawi. J, N, 3 + ble .LSGEMM_L8_END + +.LSGEMM_L8_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + srawi. I, M, 4 + ble .LSGEMM_L8x16_END + +.LSGEMM_L8x16_BEGIN: + + + mr BO, B + srawi. 
L, K, 3 + ble .LSGEMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x16_SUB4 + +.LSGEMM_L8x16_LOOP_START: + + dcbt AO, PRE + LOAD8x16_1 + KERNEL8x16_I1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble .LSGEMM_L8x16_LOOP_END + + .align 5 + +.LSGEMM_L8x16_LOOP: + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x16_LOOP + +.LSGEMM_L8x16_LOOP_END: + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + KERNEL8x16_E2 + + b .LSGEMM_L8x16_SUB1 + +.LSGEMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b .LSGEMM_L8x16_SUB1 + +.LSGEMM_L8x16_SUB0: + + andi. L, K, 7 + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x16_SAVE + b .LSGEMM_L8x16_SUB2 + +.LSGEMM_L8x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x16_SAVE + +.LSGEMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x16_SUB2 + +.LSGEMM_L8x16_SAVE: + + SAVE8x16 + + addic. I, I, -1 + bgt .LSGEMM_L8x16_BEGIN + +.LSGEMM_L8x16_END: + +.LSGEMM_L8x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L8x1_END + + andi. T1, M, 8 + ble .LSGEMM_L8x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x8_SUB4 + +.LSGEMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble .LSGEMM_L8x8_LOOP_END + + .align 5 + +.LSGEMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x8_LOOP + +.LSGEMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b .LSGEMM_L8x8_SUB1 + +.LSGEMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b .LSGEMM_L8x8_SUB1 + +.LSGEMM_L8x8_SUB0: + + andi. L, K, 7 + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x8_SAVE + b .LSGEMM_L8x8_SUB2 + +.LSGEMM_L8x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x8_SAVE + +.LSGEMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x8_SUB2 + +.LSGEMM_L8x8_SAVE: + + SAVE8x8 + +.LSGEMM_L8x8_END: + +.LSGEMM_L8x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L8x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x4_SUB4 + +.LSGEMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble .LSGEMM_L8x4_LOOP_END + + .align 5 + +.LSGEMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L8x4_LOOP + +.LSGEMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b .LSGEMM_L8x4_SUB1 + +.LSGEMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b .LSGEMM_L8x4_SUB1 + +.LSGEMM_L8x4_SUB0: + + andi. L, K, 7 + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x4_SAVE + b .LSGEMM_L8x4_SUB2 + +.LSGEMM_L8x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x4_SAVE + +.LSGEMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x4_SUB2 + +.LSGEMM_L8x4_SAVE: + + SAVE8x4 + +.LSGEMM_L8x4_END: + +.LSGEMM_L8x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L8x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x2_SUB4 + +.LSGEMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble .LSGEMM_L8x2_LOOP_END + + .align 5 + +.LSGEMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x2_LOOP + +.LSGEMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b .LSGEMM_L8x2_SUB1 + +.LSGEMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b .LSGEMM_L8x2_SUB1 + +.LSGEMM_L8x2_SUB0: + + andi. L, K, 7 + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x2_SAVE + b .LSGEMM_L8x2_SUB2 + +.LSGEMM_L8x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x2_SAVE + +.LSGEMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x2_SUB2 + +.LSGEMM_L8x2_SAVE: + + SAVE8x2 + +.LSGEMM_L8x2_END: + +.LSGEMM_L8x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L8x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x1_SUB4 + +.LSGEMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble .LSGEMM_L8x1_LOOP_END + + .align 5 + +.LSGEMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x1_LOOP + +.LSGEMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b .LSGEMM_L8x1_SUB1 + +.LSGEMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b .LSGEMM_L8x1_SUB1 + +.LSGEMM_L8x1_SUB0: + + andi. L, K, 7 + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x1_SAVE + b .LSGEMM_L8x1_SUB2 + +.LSGEMM_L8x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x1_SAVE + +.LSGEMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x1_SUB2 + +.LSGEMM_L8x1_SAVE: + + SAVE8x1 + +.LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt .LSGEMM_L8_BEGIN + + andi. T2, N, 7 + ble .L999 + +.LSGEMM_L8_END: + + b .LSGEMM_L4_BEGIN + +.L999_H1: + + b .L999 + +.LSGEMM_L4_BEGIN: + + andi. T1, N, 4 + ble .LSGEMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. 
I, M, 4 + ble .LSGEMM_L4x16_END + +.LSGEMM_L4x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x16_SUB4 + +.LSGEMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble .LSGEMM_L4x16_LOOP_END + + .align 5 + +.LSGEMM_L4x16_LOOP: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x16_LOOP + +.LSGEMM_L4x16_LOOP_END: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + KERNEL4x16_E2 + + b .LSGEMM_L4x16_SUB1 + +.LSGEMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b .LSGEMM_L4x16_SUB1 + +.LSGEMM_L4x16_SUB0: + + andi. L, K, 7 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x16_SAVE + b .LSGEMM_L4x16_SUB2 + +.LSGEMM_L4x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x16_SAVE + +.LSGEMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x16_SUB2 + +.LSGEMM_L4x16_SAVE: + + SAVE4x16 + + addic. I, I, -1 + bgt .LSGEMM_L4x16_BEGIN + +.LSGEMM_L4x16_END: + +.LSGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L4x1_END + + andi. T1, M, 8 + ble .LSGEMM_L4x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x8_SUB4 + +.LSGEMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble .LSGEMM_L4x8_LOOP_END + + .align 5 + +.LSGEMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x8_LOOP + +.LSGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LSGEMM_L4x8_SUB1 + +.LSGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LSGEMM_L4x8_SUB1 + +.LSGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x8_SAVE + b .LSGEMM_L4x8_SUB2 + +.LSGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x8_SAVE + +.LSGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x8_SUB2 + +.LSGEMM_L4x8_SAVE: + + SAVE4x8 + +.LSGEMM_L4x8_END: + +.LSGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L4x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x4_SUB4 + +.LSGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LSGEMM_L4x4_LOOP_END + + .align 5 + +.LSGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L4x4_LOOP + +.LSGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LSGEMM_L4x4_SUB1 + +.LSGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LSGEMM_L4x4_SUB1 + +.LSGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x4_SAVE + b .LSGEMM_L4x4_SUB2 + +.LSGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x4_SAVE + +.LSGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x4_SUB2 + +.LSGEMM_L4x4_SAVE: + + SAVE4x4 + +.LSGEMM_L4x4_END: + +.LSGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L4x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x2_SUB4 + +.LSGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LSGEMM_L4x2_LOOP_END + + .align 5 + +.LSGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x2_LOOP + +.LSGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LSGEMM_L4x2_SUB1 + +.LSGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LSGEMM_L4x2_SUB1 + +.LSGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x2_SAVE + b .LSGEMM_L4x2_SUB2 + +.LSGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x2_SAVE + +.LSGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x2_SUB2 + +.LSGEMM_L4x2_SAVE: + + SAVE4x2 + +.LSGEMM_L4x2_END: + +.LSGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L4x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x1_SUB4 + +.LSGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LSGEMM_L4x1_LOOP_END + + .align 5 + +.LSGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x1_LOOP + +.LSGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LSGEMM_L4x1_SUB1 + +.LSGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LSGEMM_L4x1_SUB1 + +.LSGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x1_SAVE + b .LSGEMM_L4x1_SUB2 + +.LSGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x1_SAVE + +.LSGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x1_SUB2 + +.LSGEMM_L4x1_SAVE: + + SAVE4x1 + +.LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +.LSGEMM_L4_END: +.LSGEMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LSGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble .LSGEMM_L2x16_END + +.LSGEMM_L2x16_BEGIN: + + + mr BO, B + srawi. 
L, K, 3 + ble .LSGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x16_SUB4 + +.LSGEMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble .LSGEMM_L2x16_LOOP_END + + .align 5 + +.LSGEMM_L2x16_LOOP: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x16_LOOP + +.LSGEMM_L2x16_LOOP_END: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + KERNEL2x16_E2 + + b .LSGEMM_L2x16_SUB1 + +.LSGEMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b .LSGEMM_L2x16_SUB1 + +.LSGEMM_L2x16_SUB0: + + andi. L, K, 7 + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x16_SAVE + b .LSGEMM_L2x16_SUB2 + +.LSGEMM_L2x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x16_SAVE + +.LSGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x16_SUB2 + +.LSGEMM_L2x16_SAVE: + + SAVE2x16 + + addic. I, I, -1 + bgt .LSGEMM_L2x16_BEGIN + +.LSGEMM_L2x16_END: + +.LSGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L2x1_END + + andi. T1, M, 8 + ble .LSGEMM_L2x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x8_SUB4 + +.LSGEMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble .LSGEMM_L2x8_LOOP_END + + .align 5 + +.LSGEMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x8_LOOP + +.LSGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LSGEMM_L2x8_SUB1 + +.LSGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LSGEMM_L2x8_SUB1 + +.LSGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x8_SAVE + b .LSGEMM_L2x8_SUB2 + +.LSGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x8_SAVE + +.LSGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x8_SUB2 + +.LSGEMM_L2x8_SAVE: + + SAVE2x8 + +.LSGEMM_L2x8_END: + +.LSGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L2x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x4_SUB4 + +.LSGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LSGEMM_L2x4_LOOP_END + + .align 5 + +.LSGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L2x4_LOOP + +.LSGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LSGEMM_L2x4_SUB1 + +.LSGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LSGEMM_L2x4_SUB1 + +.LSGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x4_SAVE + b .LSGEMM_L2x4_SUB2 + +.LSGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x4_SAVE + +.LSGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x4_SUB2 + +.LSGEMM_L2x4_SAVE: + + SAVE2x4 + +.LSGEMM_L2x4_END: + +.LSGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x2_SUB4 + +.LSGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LSGEMM_L2x2_LOOP_END + + .align 5 + +.LSGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x2_LOOP + +.LSGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LSGEMM_L2x2_SUB1 + +.LSGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LSGEMM_L2x2_SUB1 + +.LSGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x2_SAVE + b .LSGEMM_L2x2_SUB2 + +.LSGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x2_SAVE + +.LSGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x2_SUB2 + +.LSGEMM_L2x2_SAVE: + + SAVE2x2 + +.LSGEMM_L2x2_END: + +.LSGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x1_SUB4 + +.LSGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LSGEMM_L2x1_LOOP_END + + .align 5 + +.LSGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x1_LOOP + +.LSGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LSGEMM_L2x1_SUB1 + +.LSGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LSGEMM_L2x1_SUB1 + +.LSGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x1_SAVE + b .LSGEMM_L2x1_SUB2 + +.LSGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x1_SAVE + +.LSGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x1_SUB2 + +.LSGEMM_L2x1_SAVE: + + SAVE2x1 + +.LSGEMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +.LSGEMM_L2_END: +.LSGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LSGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble .LSGEMM_L1x16_END + +.LSGEMM_L1x16_BEGIN: + + + mr BO, B + srawi. 
L, K, 3 + ble .LSGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x16_SUB4 + +.LSGEMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble .LSGEMM_L1x16_LOOP_END + + .align 5 + +.LSGEMM_L1x16_LOOP: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x16_LOOP + +.LSGEMM_L1x16_LOOP_END: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + KERNEL1x16_E2 + + b .LSGEMM_L1x16_SUB1 + +.LSGEMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b .LSGEMM_L1x16_SUB1 + +.LSGEMM_L1x16_SUB0: + + andi. L, K, 7 + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x16_SAVE + b .LSGEMM_L1x16_SUB2 + +.LSGEMM_L1x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x16_SAVE + +.LSGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x16_SUB2 + +.LSGEMM_L1x16_SAVE: + + SAVE1x16 + + addic. I, I, -1 + bgt .LSGEMM_L1x16_BEGIN + +.LSGEMM_L1x16_END: + +.LSGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L1x1_END + + andi. T1, M, 8 + ble .LSGEMM_L1x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x8_SUB4 + +.LSGEMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble .LSGEMM_L1x8_LOOP_END + + .align 5 + +.LSGEMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x8_LOOP + +.LSGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LSGEMM_L1x8_SUB1 + +.LSGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LSGEMM_L1x8_SUB1 + +.LSGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x8_SAVE + b .LSGEMM_L1x8_SUB2 + +.LSGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x8_SAVE + +.LSGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x8_SUB2 + +.LSGEMM_L1x8_SAVE: + + SAVE1x8 + +.LSGEMM_L1x8_END: + +.LSGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x4_SUB4 + +.LSGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LSGEMM_L1x4_LOOP_END + + .align 5 + +.LSGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L1x4_LOOP + +.LSGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LSGEMM_L1x4_SUB1 + +.LSGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LSGEMM_L1x4_SUB1 + +.LSGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x4_SAVE + b .LSGEMM_L1x4_SUB2 + +.LSGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x4_SAVE + +.LSGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x4_SUB2 + +.LSGEMM_L1x4_SAVE: + + SAVE1x4 + +.LSGEMM_L1x4_END: + +.LSGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x2_SUB4 + +.LSGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LSGEMM_L1x2_LOOP_END + + .align 5 + +.LSGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x2_LOOP + +.LSGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LSGEMM_L1x2_SUB1 + +.LSGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LSGEMM_L1x2_SUB1 + +.LSGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x2_SAVE + b .LSGEMM_L1x2_SUB2 + +.LSGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x2_SAVE + +.LSGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x2_SUB2 + +.LSGEMM_L1x2_SAVE: + + SAVE1x2 + +.LSGEMM_L1x2_END: + +.LSGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x1_SUB4 + +.LSGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LSGEMM_L1x1_LOOP_END + + .align 5 + +.LSGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x1_LOOP + +.LSGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LSGEMM_L1x1_SUB1 + +.LSGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LSGEMM_L1x1_SUB1 + +.LSGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x1_SAVE + b .LSGEMM_L1x1_SUB2 + +.LSGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x1_SAVE + +.LSGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt .LSGEMM_L1x1_SUB2 + +.LSGEMM_L1x1_SAVE: + + SAVE1x1 + +.LSGEMM_L1x1_END: + +.LSGEMM_L1_END: diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S new file mode 100644 index 000000000..78f530cfa --- /dev/null +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -0,0 +1,6145 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + addi AO, AO, 64 + addi BO, BO, 32 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + +.endm + +.macro KERNEL8x16_I1 + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + lxvw4x vs28, o0, BO + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + + lxvw4x vs29, o16, BO + + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + +.endm 
+ +.macro KERNEL8x16_2 + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + + lxvw4x vs28, o0, BO + + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + + lxvw4x vs29, o16, BO + + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro 
KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, 
o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr + +#else + + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr + +#else + + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr + +#else + + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr + +#else + + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + 
xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp 
vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + 
xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + 
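+/* trmm path: C is not read back here, the accumulator is simply scaled by alpha */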
+ xvmulsp vs0, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + xsmulsp vs40, vs0, vs12 + xsmulsp vs41, vs1, vs12 + + xsmulsp vs42, vs0, vs13 + xsmulsp vs43, vs1, vs13 + + xsmulsp vs44, vs0, vs14 + xsmulsp vs45, vs1, vs14 + + xsmulsp vs46, vs0, vs15 + xsmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + xsmaddasp vs40, vs0, vs12 + xsmaddasp vs41, vs1, vs12 + + xsmaddasp vs42, vs0, vs13 + xsmaddasp vs43, vs1, vs13 + + xsmaddasp vs44, vs0, vs14 + xsmaddasp vs45, vs1, vs14 + + xsmaddasp vs46, vs0, vs15 + xsmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, 
vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + xsmaddasp vs40, vs4, vs20 + xsmaddasp vs41, vs5, vs20 + + xsmaddasp vs42, vs4, vs21 + xsmaddasp vs43, vs5, vs21 + + xsmaddasp vs44, vs4, vs22 + xsmaddasp vs45, vs5, vs22 + + xsmaddasp vs46, vs4, vs23 + xsmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + xsmaddasp vs40, vs4, vs20 + xsmaddasp vs41, vs5, vs20 + + xsmaddasp vs42, vs4, vs21 + xsmaddasp vs43, vs5, vs21 + + xsmaddasp vs44, vs4, vs22 + xsmaddasp vs45, vs5, vs22 + + xsmaddasp vs46, vs4, vs23 + xsmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + xsmulsp vs40, vs0, vs12 + xsmulsp vs41, vs1, vs12 + + xsmulsp vs42, vs0, vs13 + xsmulsp vs43, vs1, vs13 + + xsmulsp vs44, vs0, vs14 + xsmulsp vs45, vs1, vs14 + + xsmulsp vs46, vs0, vs15 + xsmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + xsmaddasp vs40, vs0, vs12 + xsmaddasp vs41, vs1, vs12 + + xsmaddasp vs42, vs0, vs13 + xsmaddasp vs43, vs1, vs13 + + xsmaddasp vs44, vs0, vs14 + xsmaddasp vs45, vs1, vs14 + + xsmaddasp vs46, vs0, vs15 + xsmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + xsmulsp vs1, vs37, alpha_r + +#else + + xsmaddasp vs0, vs36, alpha_r + xsmaddasp vs1, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + xsmulsp vs1, vs39, alpha_r + +#else + + xsmaddasp vs0, vs38, 
alpha_r + xsmaddasp vs1, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs40, alpha_r + xsmulsp vs1, vs41, alpha_r + +#else + + xsmaddasp vs0, vs40, alpha_r + xsmaddasp vs1, vs41, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs42, alpha_r + xsmulsp vs1, vs43, alpha_r + +#else + + xsmaddasp vs0, vs42, alpha_r + xsmaddasp vs1, vs43, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs44, alpha_r + xsmulsp vs1, vs45, alpha_r + +#else + + xsmaddasp vs0, vs44, alpha_r + xsmaddasp vs1, vs45, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs46, alpha_r + xsmulsp vs1, vs47, alpha_r + +#else + + xsmaddasp vs0, vs46, alpha_r + xsmaddasp vs1, vs47, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + xsmulsp vs36, vs0, vs12 + + xsmulsp vs37, vs0, vs13 + + xsmulsp vs38, vs0, vs14 + + xsmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + xsmaddasp vs36, vs0, vs12 + + xsmaddasp vs37, vs0, vs13 + + xsmaddasp vs38, vs0, vs14 + + xsmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + xsmaddasp vs36, vs4, vs20 + + xsmaddasp vs37, vs4, vs21 + + xsmaddasp vs38, vs4, vs22 + + xsmaddasp vs39, vs4, vs23 + + +.endm + 
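+/* KERNEL8x1_E2 drains the pipeline: it consumes the last prefetched A/B slice
+ * (vs4 and vs16-vs23) without issuing any further loads */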
+.macro KERNEL8x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + xsmaddasp vs36, vs4, vs20 + + xsmaddasp vs37, vs4, vs21 + + xsmaddasp vs38, vs4, vs22 + + xsmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + xsmulsp vs36, vs0, vs12 + + xsmulsp vs37, vs0, vs13 + + xsmulsp vs38, vs0, vs14 + + xsmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + xsmaddasp vs36, vs0, vs12 + + xsmaddasp vs37, vs0, vs13 + + xsmaddasp vs38, vs0, vs14 + + xsmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs35, alpha_r + +#else + + xsmaddasp vs0, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + +#else + + xsmaddasp vs0, vs36, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs37, alpha_r + +#else + + xsmaddasp vs0, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + +#else + + xsmaddasp vs0, vs38, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs39, alpha_r + +#else + + xsmaddasp vs0, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, 
BO, 16 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x 
vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, 
vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC 
+ + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, 
o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + xsmulsp vs1, vs37, alpha_r + +#else + + xsmaddasp vs0, vs36, alpha_r + xsmaddasp vs1, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef 
TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + xsmulsp vs1, vs39, alpha_r + +#else + + xsmaddasp vs0, vs38, alpha_r + xsmaddasp vs1, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs35, alpha_r + +#else + + xsmaddasp vs0, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 
+**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + 
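+/* second column of the 16x2 tile: results held in vs36-vs39 */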
+#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw 
vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + 
mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro LOAD1x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + 
xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro SAVE1x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, 
o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + 
+ mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S new file mode 100644 index 000000000..5b1c5ca6b --- /dev/null +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -0,0 +1,364 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define o12 r14 +#define o4 r15 +#define K1 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define KKK 21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "sgemm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + mr KK, OFFSET +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, KK +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + + addi T1, SP, 300 + stfs f1, 0(T1) + stfs f1, 4(T1) + stfs f1, 8(T1) + stfs f1,12(T1) + + lxsspx vs28, 0, T1 + + xxspltw alpha_r, vs28 , 0 + lxvw4x alpha_vr, 0, T1 + + + +#include "strmm_logic_16x8_power8.S" + 
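+/* .L999: common exit path for the kernel. The code below sets the
+ * integer return value in r3 to zero, restores the callee-saved FPRs
+ * (f14-f31) and GPRs (r14-r31) that the prologue spilled to the stack,
+ * releases the stack frame and returns to the caller.
+ */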
+.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S new file mode 100644 index 000000000..0d6d04858 --- /dev/null +++ b/kernel/power/strmm_logic_16x8_power8.S @@ -0,0 +1,2969 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + + + + srawi. J, N, 3 + ble .LSTRMM_L8_END + +.LSTRMM_L8_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L8x16_END + +.LSTRMM_L8x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x16_SUB4 + +.LSTRMM_L8x16_LOOP_START: + + dcbt AO, PRE + LOAD8x16_1 + dcbt AO, PRE + KERNEL8x16_I1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble .LSTRMM_L8x16_LOOP_END + + .align 5 + +.LSTRMM_L8x16_LOOP: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x16_LOOP + +.LSTRMM_L8x16_LOOP_END: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + KERNEL8x16_E2 + + b .LSTRMM_L8x16_SUB1 + +.LSTRMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b .LSTRMM_L8x16_SUB1 + +.LSTRMM_L8x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x16_SAVE + b .LSTRMM_L8x16_SUB2 + +.LSTRMM_L8x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x16_SAVE + +.LSTRMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x16_SUB2 + +.LSTRMM_L8x16_SAVE: + + SAVE8x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L8x16_BEGIN + +.LSTRMM_L8x16_END: + +.LSTRMM_L8x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L8x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L8x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x8_SUB4 + +.LSTRMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble .LSTRMM_L8x8_LOOP_END + + .align 5 + +.LSTRMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x8_LOOP + +.LSTRMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b .LSTRMM_L8x8_SUB1 + +.LSTRMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b .LSTRMM_L8x8_SUB1 + +.LSTRMM_L8x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x8_SAVE + b .LSTRMM_L8x8_SUB2 + +.LSTRMM_L8x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x8_SAVE + +.LSTRMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x8_SUB2 + +.LSTRMM_L8x8_SAVE: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L8x8_END: + +.LSTRMM_L8x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L8x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x4_SUB4 + +.LSTRMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble .LSTRMM_L8x4_LOOP_END + + .align 5 + +.LSTRMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L8x4_LOOP + +.LSTRMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b .LSTRMM_L8x4_SUB1 + +.LSTRMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b .LSTRMM_L8x4_SUB1 + +.LSTRMM_L8x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x4_SAVE + b .LSTRMM_L8x4_SUB2 + +.LSTRMM_L8x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x4_SAVE + +.LSTRMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x4_SUB2 + +.LSTRMM_L8x4_SAVE: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L8x4_END: + +.LSTRMM_L8x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L8x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x2_SUB4 + +.LSTRMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble .LSTRMM_L8x2_LOOP_END + + .align 5 + +.LSTRMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x2_LOOP + +.LSTRMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b .LSTRMM_L8x2_SUB1 + +.LSTRMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b .LSTRMM_L8x2_SUB1 + +.LSTRMM_L8x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x2_SAVE + b .LSTRMM_L8x2_SUB2 + +.LSTRMM_L8x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x2_SAVE + +.LSTRMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L8x2_SUB2 + +.LSTRMM_L8x2_SAVE: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L8x2_END: + +.LSTRMM_L8x1_BEGIN: + + andi. T1, M, 1 + ble .LSTRMM_L8x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x1_SUB4 + +.LSTRMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble .LSTRMM_L8x1_LOOP_END + + .align 5 + +.LSTRMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x1_LOOP + +.LSTRMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b .LSTRMM_L8x1_SUB1 + +.LSTRMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b .LSTRMM_L8x1_SUB1 + +.LSTRMM_L8x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x1_SAVE + b .LSTRMM_L8x1_SUB2 + +.LSTRMM_L8x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x1_SAVE + +.LSTRMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x1_SUB2 + +.LSTRMM_L8x1_SAVE: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 8 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt .LSTRMM_L8_BEGIN + + andi. T2, N, 7 + ble .L999 + +.LSTRMM_L8_END: + + b .LSTRMM_L4_BEGIN + +.L999_H1: + + b .L999 + +.LSTRMM_L4_BEGIN: + + andi. T1, N, 4 + ble .LSTRMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L4x16_END + +.LSTRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x16_SUB4 + +.LSTRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble .LSTRMM_L4x16_LOOP_END + + .align 5 + +.LSTRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x16_LOOP + +.LSTRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b .LSTRMM_L4x16_SUB1 + +.LSTRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b .LSTRMM_L4x16_SUB1 + +.LSTRMM_L4x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x16_SAVE + b .LSTRMM_L4x16_SUB2 + +.LSTRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x16_SAVE + +.LSTRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x16_SUB2 + +.LSTRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L4x16_BEGIN + +.LSTRMM_L4x16_END: + +.LSTRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L4x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x8_SUB4 + +.LSTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble .LSTRMM_L4x8_LOOP_END + + .align 5 + +.LSTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x8_LOOP + +.LSTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LSTRMM_L4x8_SUB1 + +.LSTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LSTRMM_L4x8_SUB1 + +.LSTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x8_SAVE + b .LSTRMM_L4x8_SUB2 + +.LSTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x8_SAVE + +.LSTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x8_SUB2 + +.LSTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L4x8_END: + +.LSTRMM_L4x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x4_SUB4 + +.LSTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LSTRMM_L4x4_LOOP_END + + .align 5 + +.LSTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L4x4_LOOP + +.LSTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LSTRMM_L4x4_SUB1 + +.LSTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LSTRMM_L4x4_SUB1 + +.LSTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x4_SAVE + b .LSTRMM_L4x4_SUB2 + +.LSTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x4_SAVE + +.LSTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x4_SUB2 + +.LSTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L4x4_END: + +.LSTRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x2_SUB4 + +.LSTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LSTRMM_L4x2_LOOP_END + + .align 5 + +.LSTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x2_LOOP + +.LSTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LSTRMM_L4x2_SUB1 + +.LSTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LSTRMM_L4x2_SUB1 + +.LSTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x2_SAVE + b .LSTRMM_L4x2_SUB2 + +.LSTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x2_SAVE + +.LSTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L4x2_SUB2 + +.LSTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L4x2_END: + +.LSTRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble .LSTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x1_SUB4 + +.LSTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LSTRMM_L4x1_LOOP_END + + .align 5 + +.LSTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x1_LOOP + +.LSTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LSTRMM_L4x1_SUB1 + +.LSTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LSTRMM_L4x1_SUB1 + +.LSTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x1_SAVE + b .LSTRMM_L4x1_SUB2 + +.LSTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x1_SAVE + +.LSTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x1_SUB2 + +.LSTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + +.LSTRMM_L4_END: +.LSTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LSTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L2x16_END + +.LSTRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x16_SUB4 + +.LSTRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble .LSTRMM_L2x16_LOOP_END + + .align 5 + +.LSTRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x16_LOOP + +.LSTRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b .LSTRMM_L2x16_SUB1 + +.LSTRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b .LSTRMM_L2x16_SUB1 + +.LSTRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x16_SAVE + b .LSTRMM_L2x16_SUB2 + +.LSTRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x16_SAVE + +.LSTRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x16_SUB2 + +.LSTRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L2x16_BEGIN + +.LSTRMM_L2x16_END: + +.LSTRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L2x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x8_SUB4 + +.LSTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble .LSTRMM_L2x8_LOOP_END + + .align 5 + +.LSTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x8_LOOP + +.LSTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LSTRMM_L2x8_SUB1 + +.LSTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LSTRMM_L2x8_SUB1 + +.LSTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x8_SAVE + b .LSTRMM_L2x8_SUB2 + +.LSTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x8_SAVE + +.LSTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x8_SUB2 + +.LSTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L2x8_END: + +.LSTRMM_L2x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x4_SUB4 + +.LSTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LSTRMM_L2x4_LOOP_END + + .align 5 + +.LSTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L2x4_LOOP + +.LSTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LSTRMM_L2x4_SUB1 + +.LSTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LSTRMM_L2x4_SUB1 + +.LSTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x4_SAVE + b .LSTRMM_L2x4_SUB2 + +.LSTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x4_SAVE + +.LSTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x4_SUB2 + +.LSTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L2x4_END: + +.LSTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x2_SUB4 + +.LSTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LSTRMM_L2x2_LOOP_END + + .align 5 + +.LSTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x2_LOOP + +.LSTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LSTRMM_L2x2_SUB1 + +.LSTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LSTRMM_L2x2_SUB1 + +.LSTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x2_SAVE + b .LSTRMM_L2x2_SUB2 + +.LSTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x2_SAVE + +.LSTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L2x2_SUB2 + +.LSTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L2x2_END: + +.LSTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble .LSTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x1_SUB4 + +.LSTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LSTRMM_L2x1_LOOP_END + + .align 5 + +.LSTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x1_LOOP + +.LSTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LSTRMM_L2x1_SUB1 + +.LSTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LSTRMM_L2x1_SUB1 + +.LSTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x1_SAVE + b .LSTRMM_L2x1_SUB2 + +.LSTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x1_SAVE + +.LSTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x1_SUB2 + +.LSTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +.LSTRMM_L2_END: +.LSTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LSTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L1x16_END + +.LSTRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x16_SUB4 + +.LSTRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble .LSTRMM_L1x16_LOOP_END + + .align 5 + +.LSTRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x16_LOOP + +.LSTRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b .LSTRMM_L1x16_SUB1 + +.LSTRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b .LSTRMM_L1x16_SUB1 + +.LSTRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x16_SAVE + b .LSTRMM_L1x16_SUB2 + +.LSTRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x16_SAVE + +.LSTRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L1x16_SUB2 + +.LSTRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L1x16_BEGIN + +.LSTRMM_L1x16_END: + +.LSTRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L1x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L1x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x8_SUB4 + +.LSTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble .LSTRMM_L1x8_LOOP_END + + .align 5 + +.LSTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x8_LOOP + +.LSTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LSTRMM_L1x8_SUB1 + +.LSTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LSTRMM_L1x8_SUB1 + +.LSTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x8_SAVE + b .LSTRMM_L1x8_SUB2 + +.LSTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x8_SAVE + +.LSTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L1x8_SUB2 + +.LSTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L1x8_END: + +.LSTRMM_L1x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x4_SUB4 + +.LSTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LSTRMM_L1x4_LOOP_END + + .align 5 + +.LSTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L1x4_LOOP + +.LSTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LSTRMM_L1x4_SUB1 + +.LSTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LSTRMM_L1x4_SUB1 + +.LSTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x4_SAVE + b .LSTRMM_L1x4_SUB2 + +.LSTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x4_SAVE + +.LSTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L1x4_SUB2 + +.LSTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L1x4_END: + +.LSTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x2_SUB4 + +.LSTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LSTRMM_L1x2_LOOP_END + + .align 5 + +.LSTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x2_LOOP + +.LSTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LSTRMM_L1x2_SUB1 + +.LSTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LSTRMM_L1x2_SUB1 + +.LSTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x2_SAVE + b .LSTRMM_L1x2_SUB2 + +.LSTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x2_SAVE + +.LSTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L1x2_SUB2 + +.LSTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L1x2_END: + +.LSTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble .LSTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x1_SUB4 + +.LSTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LSTRMM_L1x1_LOOP_END + + .align 5 + +.LSTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x1_LOOP + +.LSTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LSTRMM_L1x1_SUB1 + +.LSTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LSTRMM_L1x1_SUB1 + +.LSTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x1_SAVE + b .LSTRMM_L1x1_SUB2 + +.LSTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x1_SAVE + +.LSTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L1x1_SUB2 + +.LSTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +.LSTRMM_L1_END: diff --git a/param.h b/param.h index 31125d8e4..f5d1ab2ea 100644 --- a/param.h +++ b/param.h @@ -1961,15 +1961,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(POWER8) -#define SNUMOPT 4 +#define SNUMOPT 16 #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 @@ -1977,12 +1977,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 992 +#define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 488 #define ZGEMM_DEFAULT_P 240 -#define SGEMM_DEFAULT_Q 504 +#define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 400 #define ZGEMM_DEFAULT_Q 360 From dcd15b546c14d06b70721fe1a08d43ceb61e6b6f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 14 Mar 2016 14:36:59 +0100 Subject: [PATCH 2/4] BUGFIX: KERNEL.POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index d40b20dd8..f8be1d40f 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -150,7 +150,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c +#ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c From 5c658f8746835ea8e0b22829ed049888cbd6fe7d Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 18 Mar 2016 08:17:25 +0100 Subject: [PATCH 3/4] add optimized cgemm- and ctrmm-kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 12 +- kernel/power/cgemm_kernel_8x4_power8.S | 375 ++ kernel/power/cgemm_logic_8x4_power8.S | 1342 +++++ kernel/power/cgemm_macros_8x4_power8.S | 6713 ++++++++++++++++++++++++ kernel/power/ctrmm_kernel_8x4_power8.S | 385 ++ kernel/power/ctrmm_logic_8x4_power8.S | 1756 +++++++ param.h | 9 +- 7 files changed, 10584 insertions(+), 8 deletions(-) create mode 100644 kernel/power/cgemm_kernel_8x4_power8.S create mode 100644 kernel/power/cgemm_logic_8x4_power8.S create mode 100644 kernel/power/cgemm_macros_8x4_power8.S create mode 100644 kernel/power/ctrmm_kernel_8x4_power8.S create mode 100644 kernel/power/ctrmm_logic_8x4_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index f8be1d40f..eaa9f26ed 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -5,7 +5,7 @@ STRMMKERNEL = strmm_kernel_16x8_power8.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S @@ -28,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c diff --git 
a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S new file mode 100644 index 000000000..f732c8132 --- /dev/null +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -0,0 +1,375 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 400 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define TBUFFER r14 +#define L r15 +#define o12 r16 +#define o4 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) +#endif + + stfs f1, ALPHA_R_SP + stfs f2, ALPHA_I_SP + // stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 
FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "cgemm_macros_8x4_power8.S" + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12 , 12 + li o16 , 16 + li o32 , 32 + li o48 , 48 + addi TBUFFER, SP, 360 + + +#ifdef __64BIT__ + addi T1 , SP, 296 +#else + addi T1 , SP, 224 +#endif + + lxsspx alpha_r, 0, T1 + lxsspx alpha_i, o8, T1 + + .align 5 + +#include "cgemm_logic_8x4_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S new file mode 100644 index 000000000..51a063126 --- /dev/null +++ b/kernel/power/cgemm_logic_8x4_power8.S @@ -0,0 +1,1342 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 2 + ble .LCGEMM_L4_END + +.LCGEMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 3 + ble .LCGEMM_L4x8_END + +.LCGEMM_L4x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L4x8_SUB4 + +.LCGEMM_L4x8_LOOP_START: + + dcbt AO, PRE + LOAD4x8_1 + KERNEL4x8_I1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + addic. L, L, -2 + ble .LCGEMM_L4x8_LOOP_END + + .align 5 + +.LCGEMM_L4x8_LOOP: + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LCGEMM_L4x8_LOOP + +.LCGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LCGEMM_L4x8_SUB1 + +.LCGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LCGEMM_L4x8_SUB1 + +.LCGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L4x8_SAVE + b .LCGEMM_L4x8_SUB2 + +.LCGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L4x8_SAVE + +.LCGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L4x8_SUB2 + +.LCGEMM_L4x8_SAVE: + + SAVE4x8 + + addic. I, I, -1 + bgt .LCGEMM_L4x8_BEGIN + +.LCGEMM_L4x8_END: + +.LCGEMM_L4x4_BEGIN: + + andi. T2, M, 7 + ble .LCGEMM_L4x1_END + + andi. T1, M, 4 + ble .LCGEMM_L4x4_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L4x4_SUB4 + +.LCGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LCGEMM_L4x4_LOOP_END + + .align 5 + +.LCGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt .LCGEMM_L4x4_LOOP + +.LCGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LCGEMM_L4x4_SUB1 + +.LCGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LCGEMM_L4x4_SUB1 + +.LCGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. 
L, L, -1 + ble .LCGEMM_L4x4_SAVE + b .LCGEMM_L4x4_SUB2 + +.LCGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L4x4_SAVE + +.LCGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L4x4_SUB2 + +.LCGEMM_L4x4_SAVE: + + SAVE4x4 + +.LCGEMM_L4x4_END: + +.LCGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble .LCGEMM_L4x2_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L4x2_SUB4 + +.LCGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LCGEMM_L4x2_LOOP_END + + .align 5 + +.LCGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LCGEMM_L4x2_LOOP + +.LCGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LCGEMM_L4x2_SUB1 + +.LCGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LCGEMM_L4x2_SUB1 + +.LCGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L4x2_SAVE + b .LCGEMM_L4x2_SUB2 + +.LCGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L4x2_SAVE + +.LCGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L4x2_SUB2 + +.LCGEMM_L4x2_SAVE: + + SAVE4x2 + +.LCGEMM_L4x2_END: + +.LCGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble .LCGEMM_L4x1_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L4x1_SUB4 + +.LCGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LCGEMM_L4x1_LOOP_END + + .align 5 + +.LCGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LCGEMM_L4x1_LOOP + +.LCGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LCGEMM_L4x1_SUB1 + +.LCGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LCGEMM_L4x1_SUB1 + +.LCGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L4x1_SAVE + b .LCGEMM_L4x1_SUB2 + +.LCGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L4x1_SAVE + +.LCGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L4x1_SUB2 + +.LCGEMM_L4x1_SAVE: + + SAVE4x1 + +.LCGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt .LCGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999_H2 + +.LCGEMM_L4_END: + + b .LCGEMM_L2_BEGIN + +.L999_H1: + + b .L999_H2 + +.LCGEMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LCGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble .LCGEMM_L2x8_END + +.LCGEMM_L2x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L2x8_SUB4 + +.LCGEMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. 
L, L, -2 + ble .LCGEMM_L2x8_LOOP_END + + .align 5 + +.LCGEMM_L2x8_LOOP: + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LCGEMM_L2x8_LOOP + +.LCGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LCGEMM_L2x8_SUB1 + +.LCGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LCGEMM_L2x8_SUB1 + +.LCGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L2x8_SAVE + b .LCGEMM_L2x8_SUB2 + +.LCGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L2x8_SAVE + +.LCGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L2x8_SUB2 + +.LCGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt .LCGEMM_L2x8_BEGIN + +.LCGEMM_L2x8_END: + +.LCGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble .LCGEMM_L2x1_END + + andi. T1, M, 4 + ble .LCGEMM_L2x4_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L2x4_SUB4 + +.LCGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LCGEMM_L2x4_LOOP_END + + .align 5 + +.LCGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt .LCGEMM_L2x4_LOOP + +.LCGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LCGEMM_L2x4_SUB1 + +.LCGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LCGEMM_L2x4_SUB1 + +.LCGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L2x4_SAVE + b .LCGEMM_L2x4_SUB2 + +.LCGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L2x4_SAVE + +.LCGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L2x4_SUB2 + +.LCGEMM_L2x4_SAVE: + + SAVE2x4 + +.LCGEMM_L2x4_END: + +.LCGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble .LCGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L2x2_SUB4 + +.LCGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LCGEMM_L2x2_LOOP_END + + .align 5 + +.LCGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LCGEMM_L2x2_LOOP + +.LCGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LCGEMM_L2x2_SUB1 + +.LCGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LCGEMM_L2x2_SUB1 + +.LCGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L2x2_SAVE + b .LCGEMM_L2x2_SUB2 + +.LCGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L2x2_SAVE + +.LCGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. 
L, L, -1 + bgt .LCGEMM_L2x2_SUB2 + +.LCGEMM_L2x2_SAVE: + + SAVE2x2 + +.LCGEMM_L2x2_END: + +.LCGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble .LCGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L2x1_SUB4 + +.LCGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LCGEMM_L2x1_LOOP_END + + .align 5 + +.LCGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LCGEMM_L2x1_LOOP + +.LCGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LCGEMM_L2x1_SUB1 + +.LCGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LCGEMM_L2x1_SUB1 + +.LCGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L2x1_SAVE + b .LCGEMM_L2x1_SUB2 + +.LCGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L2x1_SAVE + +.LCGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L2x1_SUB2 + +.LCGEMM_L2x1_SAVE: + + SAVE2x1 + +.LCGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +.LCGEMM_L2_END: + + b .LCGEMM_L1_BEGIN + +.L999_H2: + + b .L999 + +.LCGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LCGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 3 + ble .LCGEMM_L1x8_END + +.LCGEMM_L1x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L1x8_SUB4 + +.LCGEMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble .LCGEMM_L1x8_LOOP_END + + .align 5 + +.LCGEMM_L1x8_LOOP: + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LCGEMM_L1x8_LOOP + +.LCGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LCGEMM_L1x8_SUB1 + +.LCGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LCGEMM_L1x8_SUB1 + +.LCGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L1x8_SAVE + b .LCGEMM_L1x8_SUB2 + +.LCGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L1x8_SAVE + +.LCGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L1x8_SUB2 + +.LCGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt .LCGEMM_L1x8_BEGIN + +.LCGEMM_L1x8_END: + +.LCGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble .LCGEMM_L1x1_END + + andi. T1, M, 4 + ble .LCGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L1x4_SUB4 + +.LCGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LCGEMM_L1x4_LOOP_END + + .align 5 + +.LCGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt .LCGEMM_L1x4_LOOP + +.LCGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LCGEMM_L1x4_SUB1 + +.LCGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LCGEMM_L1x4_SUB1 + +.LCGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L1x4_SAVE + b .LCGEMM_L1x4_SUB2 + +.LCGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L1x4_SAVE + +.LCGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L1x4_SUB2 + +.LCGEMM_L1x4_SAVE: + + SAVE1x4 + +.LCGEMM_L1x4_END: + +.LCGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble .LCGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L1x2_SUB4 + +.LCGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LCGEMM_L1x2_LOOP_END + + .align 5 + +.LCGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LCGEMM_L1x2_LOOP + +.LCGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LCGEMM_L1x2_SUB1 + +.LCGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LCGEMM_L1x2_SUB1 + +.LCGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L1x2_SAVE + b .LCGEMM_L1x2_SUB2 + +.LCGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L1x2_SAVE + +.LCGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L1x2_SUB2 + +.LCGEMM_L1x2_SAVE: + + SAVE1x2 + +.LCGEMM_L1x2_END: + +.LCGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble .LCGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L1x1_SUB4 + +.LCGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LCGEMM_L1x1_LOOP_END + + .align 5 + +.LCGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LCGEMM_L1x1_LOOP + +.LCGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LCGEMM_L1x1_SUB1 + +.LCGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LCGEMM_L1x1_SUB1 + +.LCGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L1x1_SAVE + b .LCGEMM_L1x1_SUB2 + +.LCGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L1x1_SAVE + +.LCGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt .LCGEMM_L1x1_SUB2 + +.LCGEMM_L1x1_SAVE: + + SAVE1x1 + +.LCGEMM_L1x1_END: + +.LCGEMM_L1_END: diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S new file mode 100644 index 000000000..2085d3764 --- /dev/null +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -0,0 +1,6713 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsaddsp + #define XSFADD_R2 xssubsp + #define XSFADD_I1 xsaddsp + #define XSFADD_I2 xsaddsp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsaddsp + #define XSFADD_R2 xsaddsp + #define XSFADD_I1 xssubsp + #define XSFADD_I2 xsaddsp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsaddsp + #define XSFADD_R2 xsaddsp + #define XSFADD_I1 xsaddsp + #define XSFADD_I2 xssubsp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsaddsp + #define XSFADD_R2 xssubsp + #define XSFADD_I1 xssubsp + #define XSFADD_I2 xssubsp + +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // 
a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_1 + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs4, o0, AO // load a0, a1 + + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs5, o16, AO // load a2, a3 + + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + addi BO, BO, 32 + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, 
a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + addi AO, AO, 64 + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + +.endm + +.macro KERNEL4x8_2 + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs0, o0, AO // load a0, a1 + + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs1, o16, AO // load a2, a3 + + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + addi AO, AO, 64 + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + addi BO, BO, 32 + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, 
a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp 
vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, 
a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, 
vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * 
alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs41, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, 
vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs45, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * 
alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs48, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs49, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs50, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs51, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, 
vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs52, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs53, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs54, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs55, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i 
* b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs56, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs57, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs58, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs59, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, 
vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs60, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs61, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs62, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs63, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i 
* b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 
0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, 
vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, 
a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i 
// r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs41, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + 
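+// Note (annotation, not in the original patch): the xssubsp/xsaddsp pair above
+// applies the complex alpha to the accumulated r0 value, i.e.
+// c_r = r_r*alpha_r - r_i*alpha_i and c_i = r_r*alpha_i + r_i*alpha_r;
+// the same two-instruction pattern is repeated for r1 just below and in every
+// SAVE block of this file.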
xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs45, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * 
alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, 
o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + 
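+	// GEMM path: preload the existing c0, c1 pair so the alpha-scaled result below is added to C;
+	// the TRMMKERNEL branch instead starts from zero and overwrites C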
lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + 
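+// same sequence as N=0 and N=1: reduce vs36/vs37 through TBUFFER, apply the complex alpha, update c0, c1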
xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + 
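+	// store the updated c0, c1 pair of this column back to C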
stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + xsmulsp vs36, vs0, vs10 // a0_r*b1_r + xsmulsp vs37, vs1, vs11 // a0_i*b1_i + xsmulsp vs38, vs0, vs11 // a0_r*b1_i + xsmulsp vs39, vs1, vs10 // a0_i*b1_r + + xsmulsp vs40, vs0, vs12 // a0_r*b2_r + xsmulsp vs41, vs1, vs13 // a0_i*b2_i + xsmulsp vs42, vs0, vs13 // a0_r*b2_i + xsmulsp vs43, vs1, vs12 // a0_i*b2_r + + xsmulsp vs44, vs0, vs14 // a0_r*b3_r + xsmulsp vs45, vs1, vs15 // a0_i*b3_i + xsmulsp vs46, vs0, vs15 // a0_r*b3_i + xsmulsp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddasp vs36, vs0, vs10 // a0_r*b1_r + xsmaddasp vs37, vs1, vs11 // a0_i*b1_i + xsmaddasp vs38, vs0, vs11 // a0_r*b1_i + xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddasp vs40, vs0, vs12 // a0_r*b2_r + xsmaddasp vs41, vs1, vs13 // a0_i*b2_i + xsmaddasp vs42, vs0, vs13 // a0_r*b2_i + xsmaddasp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddasp vs44, vs0, vs14 // a0_r*b3_r + xsmaddasp vs45, vs1, vs15 // a0_i*b3_i + xsmaddasp vs46, vs0, vs15 // a0_r*b3_i + xsmaddasp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, 
o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddasp vs36, vs4, vs18 // a4_r*b1_r + xsmaddasp vs37, vs5, vs19 // a4_i*b1_i + xsmaddasp vs38, vs4, vs19 // a4_r*b1_i + xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddasp vs40, vs4, vs20 // a4_r*b2_r + xsmaddasp vs41, vs5, vs21 // a4_i*b2_i + xsmaddasp vs42, vs4, vs21 // a4_r*b2_i + xsmaddasp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddasp vs44, vs4, vs22 // a4_r*b3_r + xsmaddasp vs45, vs5, vs23 // a4_i*b3_i + xsmaddasp vs46, vs4, vs23 // a4_r*b3_i + xsmaddasp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddasp vs36, vs4, vs18 // a4_r*b1_r + xsmaddasp vs37, vs5, vs19 // a4_i*b1_i + xsmaddasp vs38, vs4, vs19 // a4_r*b1_i + xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddasp vs40, vs4, vs20 // a4_r*b2_r + xsmaddasp vs41, vs5, vs21 // a4_i*b2_i + xsmaddasp vs42, vs4, vs21 // a4_r*b2_i + xsmaddasp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddasp vs44, vs4, vs22 // a4_r*b3_r + xsmaddasp vs45, vs5, vs23 // a4_i*b3_i + xsmaddasp vs46, vs4, vs23 // a4_r*b3_i + xsmaddasp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + xsmulsp vs36, vs0, vs10 // a0_r*b1_r + xsmulsp vs37, vs1, vs11 // a0_i*b1_i + xsmulsp vs38, vs0, vs11 // a0_r*b1_i + xsmulsp vs39, vs1, vs10 // a0_i*b1_r + + xsmulsp vs40, vs0, vs12 // a0_r*b2_r + xsmulsp vs41, vs1, vs13 // a0_i*b2_i + xsmulsp vs42, vs0, vs13 // a0_r*b2_i + xsmulsp vs43, vs1, vs12 // a0_i*b2_r + + xsmulsp vs44, vs0, vs14 // a0_r*b3_r + xsmulsp vs45, vs1, vs15 // a0_i*b3_i + xsmulsp vs46, vs0, vs15 // a0_r*b3_i + xsmulsp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddasp vs36, vs0, vs10 // a0_r*b1_r + xsmaddasp vs37, vs1, vs11 // a0_i*b1_i + xsmaddasp vs38, vs0, vs11 // a0_r*b1_i + xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddasp vs40, vs0, vs12 // a0_r*b2_r + xsmaddasp vs41, vs1, vs13 // a0_i*b2_i + xsmaddasp vs42, vs0, vs13 // a0_r*b2_i + xsmaddasp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddasp vs44, 
vs0, vs14 // a0_r*b3_r + xsmaddasp vs45, vs1, vs15 // a0_i*b3_i + xsmaddasp vs46, vs0, vs15 // a0_r*b3_i + xsmaddasp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, 
vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, 
a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + 
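+	// first k iteration: xvmulsp initializes the accumulators vs32..vs47, no previous sum to add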
xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // 
add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + 
XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs41, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, 
vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs45, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i 
+ + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, 
AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r 
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, 
TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + 
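+	// spill vs38 through TBUFFER and reload its four words as scalars vs8..vs11 (vs39 follows into vs12..vs15)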
lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, 
a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + 
xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + xsmulsp vs36, vs0, vs10 // a0_r*b1_r + xsmulsp vs37, vs1, vs11 // a0_i*b1_i + xsmulsp vs38, vs0, vs11 // a0_r*b1_i + xsmulsp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddasp vs36, vs0, vs10 // a0_r*b1_r + xsmaddasp vs37, vs1, vs11 // a0_i*b1_i + xsmaddasp vs38, vs0, vs11 // 
a0_r*b1_i + xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddasp vs36, vs4, vs18 // a4_r*b1_r + xsmaddasp vs37, vs5, vs19 // a4_i*b1_i + xsmaddasp vs38, vs4, vs19 // a4_r*b1_i + xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddasp vs36, vs4, vs18 // a4_r*b1_r + xsmaddasp vs37, vs5, vs19 // a4_i*b1_i + xsmaddasp vs38, vs4, vs19 // a4_r*b1_i + xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + xsmulsp vs36, vs0, vs10 // a0_r*b1_r + xsmulsp vs37, vs1, vs11 // a0_i*b1_i + xsmulsp vs38, vs0, vs11 // a0_r*b1_i + xsmulsp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddasp vs36, vs0, vs10 // a0_r*b1_r + xsmaddasp vs37, vs1, vs11 // a0_i*b1_i + xsmaddasp vs38, vs0, vs11 // a0_r*b1_i + xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + 
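+// Descriptive note (editor): vs36..vs39 hold the scalar partial products
+// a0_r*b1_r, a0_i*b1_i, a0_r*b1_i and a0_i*b1_r for this column; the
+// XSFADD_* helpers (add/subtract variants selected by the kernel's
+// conjugation defines) fold them into vs4 (real) and vs5 (imaginary)
+// before the complex alpha scaling below.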
XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + 
+ + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x 
vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, 
o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x 
vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro 
SAVE1x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp 
vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, 
vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load 
b0_i + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S new file mode 100644 index 000000000..b15485751 --- /dev/null +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -0,0 +1,385 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 400 +#define ALPHA_R_SP 304(SP) +#define ALPHA_I_SP 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 +#define alpha_vr vs28 +#define alpha_vi vs29 + + +#define o12 r12 +#define KKK r13 +#define K1 r14 +#define L r15 +#define o16 r16 +#define TBUFFER r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o4 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) 
+ std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) + std r12, 296(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfs f1, ALPHA_R_SP + stfs f2, ALPHA_I_SP + // stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "cgemm_macros_8x4_power8.S" + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12 , 12 + li o16 , 16 + li o32 , 32 + li o48 , 48 + addi TBUFFER, SP, 360 + + +#ifdef __64BIT__ + addi T1, SP, 304 +#else + addi T1, SP, 224 +#endif + + lxsspx alpha_r, 0, T1 + lxsspx alpha_i, o8, T1 + + .align 5 + +#include "ctrmm_logic_8x4_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) + ld r12, 296(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S new file mode 100644 index 000000000..f9656e90b --- /dev/null +++ b/kernel/power/ctrmm_logic_8x4_power8.S @@ -0,0 +1,1756 @@ 
+/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 2 + ble .LCTRMM_L4_END + +.LCTRMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble .LCTRMM_L4x8_END + +.LCTRMM_L4x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L4x8_SUB4 + +.LCTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble .LCTRMM_L4x8_LOOP_END + + .align 5 + +.LCTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. 
L, L, -1 + bgt .LCTRMM_L4x8_LOOP + +.LCTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LCTRMM_L4x8_SUB1 + +.LCTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LCTRMM_L4x8_SUB1 + +.LCTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L4x8_SAVE + b .LCTRMM_L4x8_SUB2 + +.LCTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L4x8_SAVE + +.LCTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L4x8_SUB2 + +.LCTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LCTRMM_L4x8_BEGIN + +.LCTRMM_L4x8_END: + +.LCTRMM_L4x4_BEGIN: + andi. T2, M, 7 + ble .LCTRMM_L4x1_END + + andi. T1, M, 4 + ble .LCTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L4x4_SUB4 + +.LCTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LCTRMM_L4x4_LOOP_END + + .align 5 + +.LCTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt .LCTRMM_L4x4_LOOP + +.LCTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LCTRMM_L4x4_SUB1 + +.LCTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LCTRMM_L4x4_SUB1 + +.LCTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L4x4_SAVE + b .LCTRMM_L4x4_SUB2 + +.LCTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L4x4_SAVE + +.LCTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. 
L, L, -1 + bgt .LCTRMM_L4x4_SUB2 + +.LCTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LCTRMM_L4x4_END: + +.LCTRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble .LCTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L4x2_SUB4 + +.LCTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LCTRMM_L4x2_LOOP_END + + .align 5 + +.LCTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LCTRMM_L4x2_LOOP + +.LCTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LCTRMM_L4x2_SUB1 + +.LCTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LCTRMM_L4x2_SUB1 + +.LCTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L4x2_SAVE + b .LCTRMM_L4x2_SUB2 + +.LCTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L4x2_SAVE + +.LCTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L4x2_SUB2 + +.LCTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LCTRMM_L4x2_END: + +.LCTRMM_L4x1_BEGIN: + + andi. 
T1, M, 1 + ble .LCTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L4x1_SUB4 + +.LCTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LCTRMM_L4x1_LOOP_END + + .align 5 + +.LCTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LCTRMM_L4x1_LOOP + +.LCTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LCTRMM_L4x1_SUB1 + +.LCTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LCTRMM_L4x1_SUB1 + +.LCTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L4x1_SAVE + b .LCTRMM_L4x1_SUB2 + +.LCTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L4x1_SAVE + +.LCTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L4x1_SUB2 + +.LCTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LCTRMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt .LCTRMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999_H2 + +.LCTRMM_L4_END: + + b .LCTRMM_L2_BEGIN + +.L999_H1: + + b .L999_H2 + +.LCTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LCTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble .LCTRMM_L2x8_END + +.LCTRMM_L2x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L2x8_SUB4 + +.LCTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble .LCTRMM_L2x8_LOOP_END + + .align 5 + +.LCTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LCTRMM_L2x8_LOOP + +.LCTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LCTRMM_L2x8_SUB1 + +.LCTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LCTRMM_L2x8_SUB1 + +.LCTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L2x8_SAVE + b .LCTRMM_L2x8_SUB2 + +.LCTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L2x8_SAVE + +.LCTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L2x8_SUB2 + +.LCTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LCTRMM_L2x8_BEGIN + +.LCTRMM_L2x8_END: + +.LCTRMM_L2x4_BEGIN: + andi. T2, M, 7 + ble .LCTRMM_L2x1_END + + andi. T1, M, 4 + ble .LCTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L2x4_SUB4 + +.LCTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LCTRMM_L2x4_LOOP_END + + .align 5 + +.LCTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt .LCTRMM_L2x4_LOOP + +.LCTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LCTRMM_L2x4_SUB1 + +.LCTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LCTRMM_L2x4_SUB1 + +.LCTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L2x4_SAVE + b .LCTRMM_L2x4_SUB2 + +.LCTRMM_L2x4_SUB1: + + andi. 
L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L2x4_SAVE + +.LCTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L2x4_SUB2 + +.LCTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LCTRMM_L2x4_END: + +.LCTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble .LCTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L2x2_SUB4 + +.LCTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LCTRMM_L2x2_LOOP_END + + .align 5 + +.LCTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LCTRMM_L2x2_LOOP + +.LCTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LCTRMM_L2x2_SUB1 + +.LCTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LCTRMM_L2x2_SUB1 + +.LCTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L2x2_SAVE + b .LCTRMM_L2x2_SUB2 + +.LCTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L2x2_SAVE + +.LCTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L2x2_SUB2 + +.LCTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LCTRMM_L2x2_END: + +.LCTRMM_L2x1_BEGIN: + + andi. 
T1, M, 1 + ble .LCTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L2x1_SUB4 + +.LCTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LCTRMM_L2x1_LOOP_END + + .align 5 + +.LCTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LCTRMM_L2x1_LOOP + +.LCTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LCTRMM_L2x1_SUB1 + +.LCTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LCTRMM_L2x1_SUB1 + +.LCTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L2x1_SAVE + b .LCTRMM_L2x1_SUB2 + +.LCTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L2x1_SAVE + +.LCTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L2x1_SUB2 + +.LCTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LCTRMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +.LCTRMM_L2_END: + + b .LCTRMM_L1_BEGIN + +.L999_H2: + + b .L999 + +.LCTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LCTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble .LCTRMM_L1x8_END + +.LCTRMM_L1x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L1x8_SUB4 + +.LCTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble .LCTRMM_L1x8_LOOP_END + + .align 5 + +.LCTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LCTRMM_L1x8_LOOP + +.LCTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LCTRMM_L1x8_SUB1 + +.LCTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LCTRMM_L1x8_SUB1 + +.LCTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L1x8_SAVE + b .LCTRMM_L1x8_SUB2 + +.LCTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L1x8_SAVE + +.LCTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L1x8_SUB2 + +.LCTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LCTRMM_L1x8_BEGIN + +.LCTRMM_L1x8_END: + +.LCTRMM_L1x4_BEGIN: + andi. T2, M, 7 + ble .LCTRMM_L1x1_END + + andi. T1, M, 4 + ble .LCTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L1x4_SUB4 + +.LCTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LCTRMM_L1x4_LOOP_END + + .align 5 + +.LCTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt .LCTRMM_L1x4_LOOP + +.LCTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LCTRMM_L1x4_SUB1 + +.LCTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LCTRMM_L1x4_SUB1 + +.LCTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L1x4_SAVE + b .LCTRMM_L1x4_SUB2 + +.LCTRMM_L1x4_SUB1: + + andi. 
L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L1x4_SAVE + +.LCTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L1x4_SUB2 + +.LCTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LCTRMM_L1x4_END: + +.LCTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble .LCTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L1x2_SUB4 + +.LCTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LCTRMM_L1x2_LOOP_END + + .align 5 + +.LCTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LCTRMM_L1x2_LOOP + +.LCTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LCTRMM_L1x2_SUB1 + +.LCTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LCTRMM_L1x2_SUB1 + +.LCTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L1x2_SAVE + b .LCTRMM_L1x2_SUB2 + +.LCTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L1x2_SAVE + +.LCTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L1x2_SUB2 + +.LCTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LCTRMM_L1x2_END: + +.LCTRMM_L1x1_BEGIN: + + andi. 
T1, M, 1 + ble .LCTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L1x1_SUB4 + +.LCTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LCTRMM_L1x1_LOOP_END + + .align 5 + +.LCTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LCTRMM_L1x1_LOOP + +.LCTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LCTRMM_L1x1_SUB1 + +.LCTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LCTRMM_L1x1_SUB1 + +.LCTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L1x1_SAVE + b .LCTRMM_L1x1_SUB2 + +.LCTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L1x1_SAVE + +.LCTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L1x1_SUB2 + +.LCTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LCTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +.LCTRMM_L1_END: diff --git a/param.h b/param.h index f5d1ab2ea..980650e09 100644 --- a/param.h +++ b/param.h @@ -1972,23 +1972,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 -#define CGEMM_DEFAULT_P 488 +#define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 240 #define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 -#define CGEMM_DEFAULT_Q 400 +#define CGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 360 #define SGEMM_DEFAULT_R 28800 #define DGEMM_DEFAULT_R 14400 +#define CGEMM_DEFAULT_R 14400 #define ZGEMM_DEFAULT_R 7200 #define SYMV_P 8 From e1df5a6e23c2ab73385984289f24472cb2f0cb66 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 18 Mar 2016 12:12:03 +0100 Subject: [PATCH 4/4] fixed sgemm- and strmm-kernel --- kernel/power/sgemm_kernel_16x8_power8.S | 22 +- kernel/power/sgemm_logic_16x8_power8.S | 4 +- kernel/power/sgemm_macros_16x8_power8.S | 3190 +++++++++++++++++++---- kernel/power/strmm_kernel_16x8_power8.S | 21 +- kernel/power/strmm_logic_16x8_power8.S | 4 +- param.h | 4 +- 6 files changed, 2703 insertions(+), 542 deletions(-) diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index 9f221301a..031f342ad 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -26,10 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -81,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 320 +#define STACKSIZE 340 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else @@ -127,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define alpha_r vs30 -#define alpha_vr vs31 #define o0 0 +#define TBUFFER r14 #define o4 r15 #define o12 r16 #define o8 r17 @@ -202,6 +203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -220,6 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) + stw r14, 212(SP) #endif // stfd f1, ALPHA_SP @@ -259,24 +262,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpwi cr0, K, 0 ble .L999_H1 - li PRE, 384 + li PRE, 256 li o4 , 4 li o8 , 8 li o12, 12 li o16, 16 li o32, 32 li o48, 48 + addi TBUFFER, SP, 320 addi T1, SP, 300 stfs f1, 0(T1) - stfs f1, 4(T1) - stfs f1, 8(T1) - stfs f1,12(T1) - lxsspx vs28, 0, T1 - - xxspltw alpha_r, vs28 , 0 - lxvw4x alpha_vr, 0, T1 + lxsspx alpha_r, 0, T1 @@ -326,6 +324,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -344,6 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) + lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S index 6c5a1c7ef..0ae6413ce 100644 --- a/kernel/power/sgemm_logic_16x8_power8.S +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -26,13 +26,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 3 ble .LSGEMM_L8_END diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index 78f530cfa..a2d36c089 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -26,10 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** @@ -38,49 +39,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD8x16_1 - lxvw4x vs28, o0, BO - lxvw4x vs29, o16, BO - lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 + lxvw4x vs29, o16, BO + xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 - addi AO, AO, 64 - addi BO, BO, 32 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 + addi BO, BO, 32 .endm .macro KERNEL8x16_I1 - xvmulsp vs32, vs0, vs8 - xvmulsp vs33, vs1, vs8 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO - - xvmulsp vs34, vs2, vs8 - xvmulsp vs35, vs3, vs8 - - lxvw4x vs28, o0, BO - lxvw4x vs29, o16, BO - - xvmulsp vs36, vs0, vs9 - xvmulsp vs37, vs1, vs9 - lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 @@ -104,27 +121,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs54, vs2, vs13 xvmulsp vs55, vs3, vs13 - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 - xvmulsp vs56, vs0, vs14 xvmulsp vs57, vs1, vs14 xvmulsp vs58, vs2, vs14 xvmulsp vs59, vs3, vs14 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 - xvmulsp vs60, vs0, vs15 xvmulsp vs61, vs1, vs15 - - addi AO, AO, 64 - addi BO, BO, 32 - xvmulsp vs62, vs2, vs15 xvmulsp vs63, vs3, vs15 @@ -135,36 +138,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 - + lxvw4x vs28, o0, BO lxvw4x vs4, o0, AO - lxvw4x vs5, o16, AO - xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 - lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + lxvw4x vs5, o16, AO xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 - lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO - xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 - - lxvw4x vs29, o16, BO - xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + xvmaddasp vs48, vs0, vs12 xvmaddasp vs49, vs1, vs12 xvmaddasp vs50, vs2, vs12 @@ -172,36 +179,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0, vs13 xvmaddasp vs53, vs1, vs13 - - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xvmaddasp vs54, vs2, vs13 xvmaddasp vs55, vs3, vs13 xvmaddasp vs56, vs0, vs14 xvmaddasp vs57, vs1, vs14 - - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 - + addi AO, AO, 64 + addi BO, BO, 32 xvmaddasp vs58, vs2, vs14 xvmaddasp vs59, vs3, vs14 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xvmaddasp vs60, vs0, vs15 xvmaddasp vs61, vs1, vs15 - - addi AO, AO, 64 - addi BO, BO, 32 - xvmaddasp vs62, vs2, vs15 xvmaddasp vs63, vs3, vs15 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 .endm @@ -210,8 +202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 + lxvw4x vs28, o0, BO lxvw4x vs0, o0, AO - lxvw4x vs1, o16, AO xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 @@ -219,28 +211,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 - lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + lxvw4x vs1, o16, AO xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 - xvmaddasp vs40, vs4, vs18 - xvmaddasp vs41, vs5, vs18 - lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 - - lxvw4x vs29, o16, BO - xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + xvmaddasp vs48, vs4, vs20 xvmaddasp vs49, vs5, vs20 xvmaddasp vs50, vs6, vs20 @@ -248,32 +247,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs52, vs4, vs21 xvmaddasp vs53, vs5, vs21 - - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 - xvmaddasp vs54, vs6, vs21 xvmaddasp vs55, vs7, vs21 xvmaddasp vs56, vs4, vs22 xvmaddasp vs57, vs5, vs22 - - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 - xvmaddasp vs58, vs6, vs22 xvmaddasp vs59, vs7, vs22 xvmaddasp vs60, vs4, vs23 xvmaddasp vs61, vs5, vs23 - addi AO, AO, 64 addi BO, BO, 32 - xvmaddasp vs62, vs6, vs23 xvmaddasp vs63, vs7, vs23 @@ -479,22 +464,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - xvmulsp vs2, vs34, alpha_vr - xvmulsp vs3, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - xvmaddasp vs2, vs34, alpha_vr - xvmaddasp vs3, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -512,22 +581,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - xvmulsp vs2, vs38, alpha_vr - xvmulsp vs3, vs39, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr - xvmaddasp vs2, vs38, alpha_vr - xvmaddasp vs3, vs39, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -545,22 +698,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs40, alpha_vr - xvmulsp vs1, vs41, alpha_vr - xvmulsp vs2, vs42, alpha_vr - xvmulsp vs3, vs43, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs40, alpha_vr - xvmaddasp vs1, vs41, alpha_vr - xvmaddasp vs2, vs42, alpha_vr - xvmaddasp vs3, vs43, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs41, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -578,22 +815,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs44, alpha_vr - xvmulsp vs1, vs45, alpha_vr - xvmulsp vs2, vs46, alpha_vr - xvmulsp vs3, vs47, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs44, alpha_vr - xvmaddasp vs1, vs45, alpha_vr - xvmaddasp vs2, vs46, alpha_vr - xvmaddasp vs3, vs47, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs45, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -611,22 +932,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs48, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs48, alpha_vr - xvmulsp vs1, vs49, alpha_vr - xvmulsp vs2, vs50, alpha_vr - xvmulsp vs3, vs51, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs48, alpha_vr - xvmaddasp vs1, vs49, alpha_vr - xvmaddasp vs2, vs50, alpha_vr - xvmaddasp vs3, vs51, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs49, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs50, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs51, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -644,22 +1049,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs52, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs52, alpha_vr - xvmulsp vs1, vs53, alpha_vr - xvmulsp vs2, vs54, alpha_vr - xvmulsp vs3, vs55, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs52, alpha_vr - xvmaddasp vs1, vs53, alpha_vr - xvmaddasp vs2, vs54, alpha_vr - xvmaddasp vs3, vs55, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs53, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs54, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs55, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -677,22 +1166,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs56, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs56, alpha_vr - xvmulsp vs1, vs57, alpha_vr - xvmulsp vs2, vs58, alpha_vr - xvmulsp vs3, vs59, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs56, alpha_vr - xvmaddasp vs1, vs57, alpha_vr - xvmaddasp vs2, vs58, alpha_vr - xvmaddasp vs3, vs59, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs57, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs58, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs59, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -710,22 +1283,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs60, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs60, alpha_vr - xvmulsp vs1, vs61, alpha_vr - xvmulsp vs2, vs62, alpha_vr - xvmulsp vs3, vs63, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs60, alpha_vr - xvmaddasp vs1, vs61, alpha_vr - xvmaddasp vs2, vs62, alpha_vr - xvmaddasp vs3, vs63, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs61, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs62, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs63, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1068,18 +1725,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1093,18 +1790,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs34, alpha_vr - xvmulsp vs1, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs34, alpha_vr - xvmaddasp vs1, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1118,18 +1855,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1143,18 +1920,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs38, alpha_vr - xvmulsp vs1, vs39, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs38, alpha_vr - xvmaddasp vs1, vs39, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1168,18 +1985,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs40, alpha_vr - xvmulsp vs1, vs41, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs40, alpha_vr - xvmaddasp vs1, vs41, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs41, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1193,18 +2050,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs42, alpha_vr - xvmulsp vs1, vs43, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs42, alpha_vr - xvmaddasp vs1, vs43, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs43, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1218,18 +2115,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs44, alpha_vr - xvmulsp vs1, vs45, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs44, alpha_vr - xvmaddasp vs1, vs45, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs45, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1243,18 +2180,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs46, alpha_vr - xvmulsp vs1, vs47, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs46, alpha_vr - xvmaddasp vs1, vs47, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs47, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1540,16 +2517,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1561,16 +2556,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs33, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs33, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1582,16 +2595,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs34, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs34, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1603,16 +2634,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1624,16 +2673,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs36, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs36, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1645,16 +2712,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs37, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs37, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1666,16 +2751,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs38, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs38, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1687,16 +2790,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs39, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs39, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2043,8 +3164,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r - xsmaddasp vs1, vs33, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs33, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2068,8 +3191,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r - xsmaddasp vs1, vs35, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs35, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2093,8 +3218,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs36, alpha_r - xsmaddasp vs1, vs37, alpha_r + xsmulsp vs28, vs36, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs37, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2118,8 +3245,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs38, alpha_r - xsmaddasp vs1, vs39, alpha_r + xsmulsp vs28, vs38, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs39, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2143,8 +3272,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs40, alpha_r - xsmaddasp vs1, vs41, alpha_r + xsmulsp vs28, vs40, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs41, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2168,8 +3299,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs42, alpha_r - xsmaddasp vs1, vs43, alpha_r + xsmulsp vs28, vs42, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs43, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2193,8 +3326,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs44, alpha_r - xsmaddasp vs1, vs45, alpha_r + xsmulsp vs28, vs44, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs45, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2218,8 +3353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else - xsmaddasp vs0, vs46, alpha_r - xsmaddasp vs1, vs47, alpha_r + xsmulsp vs28, vs46, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs47, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2514,7 +3651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2535,7 +3673,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs33, alpha_r + xsmulsp vs28, vs33, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2556,7 +3695,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2577,7 +3717,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs35, alpha_r + xsmulsp vs28, vs35, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2598,7 +3739,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs36, alpha_r + xsmulsp vs28, vs36, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2619,7 +3761,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs37, alpha_r + xsmulsp vs28, vs37, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2640,7 +3783,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs38, alpha_r + xsmulsp vs28, vs38, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2661,7 +3805,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs39, alpha_r + xsmulsp vs28, vs39, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2952,22 +4097,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - xvmulsp vs2, vs34, alpha_vr - xvmulsp vs3, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - xvmaddasp vs2, vs34, alpha_vr - xvmaddasp vs3, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -2985,22 +4214,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - xvmulsp vs2, vs38, alpha_vr - xvmulsp vs3, vs39, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr - xvmaddasp vs2, vs38, alpha_vr - xvmaddasp vs3, vs39, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -3018,22 +4331,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs40, alpha_vr - xvmulsp vs1, vs41, alpha_vr - xvmulsp vs2, vs42, alpha_vr - xvmulsp vs3, vs43, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs40, alpha_vr - xvmaddasp vs1, vs41, alpha_vr - xvmaddasp vs2, vs42, alpha_vr - xvmaddasp vs3, vs43, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs41, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -3051,22 +4448,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs44, alpha_vr - xvmulsp vs1, vs45, alpha_vr - xvmulsp vs2, vs46, alpha_vr - xvmulsp vs3, vs47, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs44, alpha_vr - xvmaddasp vs1, vs45, alpha_vr - xvmaddasp vs2, vs46, alpha_vr - xvmaddasp vs3, vs47, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs45, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -3295,18 +4776,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -3320,18 +4841,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs34, alpha_vr - xvmulsp vs1, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs34, alpha_vr - xvmaddasp vs1, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -3345,18 +4906,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -3370,18 +4971,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs38, alpha_vr - xvmulsp vs1, vs39, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs38, alpha_vr - xvmaddasp vs1, vs39, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -3577,16 +5218,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -3598,16 +5257,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs33, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs33, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -3619,16 +5296,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs34, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs34, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -3640,16 +5335,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -3882,8 +5595,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r - xsmaddasp vs1, vs33, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs33, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -3907,8 +5622,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r - xsmaddasp vs1, vs35, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs35, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -3932,8 +5649,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs36, alpha_r - xsmaddasp vs1, vs37, alpha_r + xsmulsp vs28, vs36, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs37, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -3957,8 +5676,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs38, alpha_r - xsmaddasp vs1, vs39, alpha_r + xsmulsp vs28, vs38, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs39, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -4163,7 +5884,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -4184,7 +5906,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else - xsmaddasp vs0, vs33, alpha_r + xsmulsp vs28, vs33, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -4205,7 +5928,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -4226,7 +5950,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs35, alpha_r + xsmulsp vs28, vs35, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -4445,22 +6170,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - xvmulsp vs2, vs34, alpha_vr - xvmulsp vs3, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - xvmaddasp vs2, vs34, alpha_vr - xvmaddasp vs3, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4478,22 +6287,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - xvmulsp vs2, vs38, alpha_vr - xvmulsp vs3, vs39, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr - xvmaddasp vs2, vs38, alpha_vr - xvmaddasp vs3, vs39, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4674,18 +6567,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4699,18 +6632,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs34, alpha_vr - xvmulsp vs1, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs34, alpha_vr - xvmaddasp vs1, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4870,16 +6843,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -4891,16 +6882,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs33, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs33, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5085,8 +7094,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r - xsmaddasp vs1, vs33, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs33, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -5110,8 +7121,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r - xsmaddasp vs1, vs35, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs35, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -5280,7 +7293,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -5301,7 +7315,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs33, alpha_r + xsmulsp vs28, vs33, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -5484,22 +7499,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - xvmulsp vs2, vs34, alpha_vr - xvmulsp vs3, vs35, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - xvmaddasp vs2, vs34, alpha_vr - xvmaddasp vs3, vs35, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + 
xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -5656,18 +7755,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -5809,16 +7948,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL - - xvmulsp vs0, vs32, alpha_vr - + lxvw4x vs0, o0, TBUFFER #else - - xvmaddasp vs0, vs32, alpha_vr - + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5979,8 +8136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r - xsmaddasp vs1, vs33, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs33, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -6131,7 +8290,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 #endif diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index 5b1c5ca6b..5e607c58f 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -26,10 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -81,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 320 +#define STACKSIZE 340 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else @@ -127,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define alpha_r vs30 -#define alpha_vr vs31 #define o0 0 +#define TBUFFER r13 #define o12 r14 #define o4 r15 #define K1 r16 @@ -138,7 +139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define L r18 #define T1 r19 #define KK r20 -#define KKK 21 +#define KKK r21 #define I r22 #define J r23 #define AO r24 @@ -204,6 +205,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) + std r13, 288(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -223,6 +225,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) + stw r13, 216(SP) #endif // stfd f1, ALPHA_SP @@ -274,17 +277,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o16, 16 li o32, 32 li o48, 48 + addi TBUFFER, SP, 320 addi T1, SP, 300 stfs f1, 0(T1) - stfs f1, 4(T1) - stfs f1, 8(T1) - stfs f1,12(T1) - lxsspx vs28, 0, T1 + lxsspx alpha_r, 0, T1 - xxspltw alpha_r, vs28 , 0 - lxvw4x alpha_vr, 0, T1 @@ -335,6 +334,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) + ld r13, 288(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -354,6 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) + lwz r13, 216(SP) #endif addi SP, SP, STACKSIZE diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S index 0d6d04858..8ec11f1ef 100644 --- a/kernel/power/strmm_logic_16x8_power8.S +++ b/kernel/power/strmm_logic_16x8_power8.S @@ -26,14 +26,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 3 ble .LSTRMM_L8_END diff --git a/param.h b/param.h index 980650e09..370d10b9a 100644 --- a/param.h +++ b/param.h @@ -1977,12 +1977,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 960 +#define SGEMM_DEFAULT_P 480 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 240 -#define SGEMM_DEFAULT_Q 720 +#define SGEMM_DEFAULT_Q 1440 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 360
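
For reference, the alpha handling in the new store paths above works as follows: instead of splatting alpha into a vector register (the removed alpha_vr) and using xvmulsp/xvmaddasp, the kernel now writes each 4-float accumulator to a small stack buffer (TBUFFER, anchored at SP+320 via r13), rescales the four lanes with scalar xsmulsp, reloads the result as a vector, and either stores it directly (TRMMKERNEL) or adds it with xvaddsp to the destination values previously loaded from C. The scalar tail paths likewise split the fused xsmaddasp into a separate xsmulsp/xsaddsp pair. A minimal C sketch of the per-vector operation — hypothetical names, not code from the patch, and assuming c[] already holds the previously loaded destination values:

    /* Illustrative sketch only: mirrors one TBUFFER round trip above. */
    static void scale_store4(float *c, const float *ab, float alpha, int trmm)
    {
        float t[4];                      /* plays the role of TBUFFER          */
        for (int i = 0; i < 4; i++)
            t[i] = ab[i] * alpha;        /* scalar xsmulsp on each lane        */
        for (int i = 0; i < 4; i++)
            c[i] = trmm ? t[i]           /* TRMMKERNEL: C = alpha * AB         */
                        : c[i] + t[i];   /* GEMM:       C = C + alpha * AB     */
    }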