From 9185d419d3c0452a898eb44618d47c11c9cd450e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:09:20 +0100 Subject: [PATCH 01/28] Version 0.3.5 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 24c169afe..ac5dd93de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 5.dev) +set(OpenBLAS_PATCH_VERSION 5) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From eebc18928715775c9ed254684edee16e4efe0342 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:09:59 +0100 Subject: [PATCH 02/28] Version 0.3.5 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0d5b83b39..3033455d3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.5.dev +VERSION = 0.3.5 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 11530b76f7b19fbb2d9089ab8166ab54bde8b423 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Apr 2019 09:58:56 +0200 Subject: [PATCH 03/28] Correct INFO=4 condition --- relapack/src/cgetrf.c | 2 +- relapack/src/dgetrf.c | 5 ++--- relapack/src/sgetrf.c | 7 +------ relapack/src/zgetrf.c | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index 9aab718a0..878c9ec15 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -22,7 +22,7 @@ void RELAPACK_cgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c index c4bce8fc5..be960fde9 100644 --- a/relapack/src/dgetrf.c +++ b/relapack/src/dgetrf.c @@ -15,16 +15,15 @@ void RELAPACK_dgetrf( double *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - // Check arguments *info = 0; if (*m < 0) *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; - if (*info) { + if (*info!=0) { const blasint minfo = -*info; LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF")); return; diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 9d0ff1039..0231cc166 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -1,5 +1,4 @@ #include "relapack.h" - static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); @@ -22,16 +21,14 @@ void RELAPACK_sgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } - const blasint sn = MIN(*m, *n); - RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); // Right remainder @@ -61,7 +58,6 @@ static void RELAPACK_sgetrf_rec( float *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - if (*n <= MAX(CROSSOVER_SGETRF, 1)) { // Unblocked LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); @@ -77,7 +73,6 @@ static void RELAPACK_sgetrf_rec( const blasint n1 = SREC_SPLIT(*n); const blasint n2 = *n - n1; const blasint m2 = 
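/* n is split at n1 = SREC_SPLIT(n): A_L below is the left m x n1 panel and A_R the remaining n2 columns. The left panel is factored recursively first and the trailing m2 x n2 block afterwards, the usual recursive-LU pattern. */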
*m - n1; - // A_L A_R float *const A_L = A; float *const A_R = A + *ldA * n1; diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index 121b03401..b0d14ffb1 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -22,7 +22,7 @@ void RELAPACK_zgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; From 2cd463eabdcecce01a379c7aaebbb0c48e21c27d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Apr 2019 10:02:28 +0200 Subject: [PATCH 04/28] Disable reallocation of work array in xSYTRF as it appears to cause memory management problems (seen in the LAPACK tests) --- relapack/config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/relapack/config.h b/relapack/config.h index 9113a712d..e4fab0a12 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -36,8 +36,8 @@ // allow malloc in xsygst for improved performance #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC // allow malloc in xsytrf if the passed work buffer is too small -#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC - +//#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC +#define XSYTRF_ALLOW_MALLOC 0 //////////////////////////////// // LAPACK routine replacement // From 1036299da06d4ebd60139529885804fa63400e10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 00:12:37 +0200 Subject: [PATCH 05/28] Disable repeated recursion on Ab_BR in ReLAPACK xGBTRF due to crashes in LAPACK tests --- relapack/src/cgbtrf.c | 4 +++- relapack/src/dgbtrf.c | 6 ++++-- relapack/src/sgbtrf.c | 20 +++++++++++++------- relapack/src/zgbtrf.c | 12 +++++++----- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index eddfdedf7..61332c6a6 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -221,7 +221,9 @@ static void RELAPACK_cgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + //RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index f4b443629..cdf06ad5b 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -1,5 +1,6 @@ #include "relapack.h" -#include "stdlib.h" +#include +#include static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, const blasint *, blasint *); @@ -218,7 +219,8 @@ static void RELAPACK_dgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +// RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index 3a4de4ece..3e3fdf455 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -27,7 +27,7 @@ void RELAPACK_sgbtrf( *info = -3; else if (*ku < 0) *info = -4; - else if (*ldAb < 2 * *kl + *ku + 1) + else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { const blasint minfo = -*info; @@ -55,15 +55,16 @@ void RELAPACK_sgbtrf( // Allocate work space const blasint n1 = SREC_SPLIT(*n); - const blasint 
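/* the replacement lines below wrap each extent in abs(), keeping the work-array sizes non-negative before they reach malloc and slaset */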
mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv ); + const blasint nWorkl = abs( (kv > n1) ? n1 : kv ); + const blasint mWorku = abs( (*kl > n1) ? n1 : *kl ); + const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl ); float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); float *Worku = malloc(mWorku * nWorku * sizeof(float)); LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + // Recursive kernel RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); @@ -81,6 +82,7 @@ static void RELAPACK_sgbtrf_rec( blasint *info ) { + if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { // Unblocked LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); @@ -127,7 +129,7 @@ static void RELAPACK_sgbtrf_rec( float *const A_BR = A + *ldA * n1 + m1; // ipiv_T - // ipiv_B + // ipiv_B blasint *const ipiv_T = ipiv; blasint *const ipiv_B = ipiv + n1; @@ -155,6 +157,7 @@ static void RELAPACK_sgbtrf_rec( float *const A_BRbl = A_BR + m21; float *const A_BRbr = A_BR + *ldA * n21 + m21; + // recursion(Ab_L, ipiv_T) RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); @@ -216,8 +219,11 @@ static void RELAPACK_sgbtrf_rec( } } + // recursion(Ab_BR, ipiv_B) - RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +//cause of infinite recursion here ? +// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index 0dd3fa7c3..d4ba41753 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -56,10 +56,10 @@ void RELAPACK_zgbtrf( // Allocate work space const blasint n1 = ZREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); + const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl); + const blasint nWorku = abs ( (*kl > n1) ? 
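/* same non-negative clamping as in the single-precision sgbtrf above; the complex work buffers allocated just below hold two doubles per element, hence the factor of 2 in their malloc sizes */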
MAX(0, *n - *kl) : *kl); double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -221,7 +221,9 @@ static void RELAPACK_zgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + // RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots From 0f105dd8a5a597b2f468f774a52da226581efbdc Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Sat, 13 Apr 2019 13:56:19 +0000 Subject: [PATCH 06/28] sgemm/strmm --- CONTRIBUTORS.md | 5 +- kernel/power/KERNEL.POWER9 | 6 +- kernel/power/sgemm_kernel_power9.S | 286 ++ kernel/power/sgemm_logic_power9.S | 2133 ++++++++++ kernel/power/sgemm_macros_power9.S | 5828 ++++++++++++++++++++++++++++ param.h | 4 +- 6 files changed, 8256 insertions(+), 6 deletions(-) create mode 100644 kernel/power/sgemm_kernel_power9.S create mode 100644 kernel/power/sgemm_logic_power9.S create mode 100644 kernel/power/sgemm_macros_power9.S diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 08f8cc69d..3859a9c19 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -167,4 +167,7 @@ In chronological order: * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 - + * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes + * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes + * [2019-03-14] power9 dgemm/dtrmm kernel + * [2019-04-29] power9 sgemm/strmm kernel diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 86a931971..6d5cf9068 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -3,16 +3,16 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = strmm_kernel_16x8_power8.S +STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S new file mode 100644 index 000000000..a44659468 --- /dev/null +++ b/kernel/power/sgemm_kernel_power9.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) + +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + +/* cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 +*/ + + + /*alpha is stored in f1. 
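f1 overlaps VSX register vs1, so the incoming double-precision value is directly visible to the vector unit: xscvdpspn narrows it to single precision and xxspltw copies that word into all four lanes of alpha_r for use by the vector kernels. In short,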
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + ori T2, T2, perm_const2@higher + rldicr T2, T2, 32, 31 + oris T2, T2, perm_const2@h + ori T2, T2, perm_const2@l + + lis T1, perm_const1@highest + ori T1, T1, perm_const1@higher + rldicr T1, T1, 32, 31 + oris T1, T1, perm_const1@h + ori T1, T1, perm_const1@l + + mtvsrdd permute_mask,T2,T1 + + lis T2, save_permute_12@highest + ori T2, T2, save_permute_12@higher + rldicr T2, T2, 32, 31 + oris T2, T2, save_permute_12@h + ori T2, T2, save_permute_12@l + + lis T1, save_permute_11@highest + ori T1, T1, save_permute_11@higher + rldicr T1, T1, 32, 31 + oris T1, T1, save_permute_11@h + ori T1, T1, save_permute_11@l + + mtvsrdd save_permute_1,T2,T1 + + lis T2, save_permute_22@highest + ori T2, T2, save_permute_22@higher + rldicr T2, T2, 32, 31 + oris T2, T2, save_permute_22@h + ori T2, T2, save_permute_22@l + + lis T1, save_permute_21@highest + ori T1, T1, save_permute_21@higher + rldicr T1, T1, 32, 31 + oris T1, T1, save_permute_21@h + ori T1, T1, save_permute_21@l + + mtvsrdd save_permute_2,T2,T1 + +#include "sgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S new file mode 100644 index 000000000..300e30470 --- /dev/null +++ b/kernel/power/sgemm_logic_power9.S @@ -0,0 +1,2133 @@ +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
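/* L = number of full 64-iteration blocks in K-1, i.e. how many unrolled passes the main 8x16 loop makes; the record (.) form sets CR0 so the ble below can branch to the tail handling when L is zero */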
L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + + MY_ALIGN +LSGEMM_L8x16_LOOP_START: + + LOAD8x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=32 + addi AO,AO,2112 + addi BO,BO,32 + + mtctr L + + MY_ALIGN + +LSGEMM_L8x16_LOOP: + + KERNEL8x16_I1_L4_2 -2048,0, 0,0 + KERNEL8x16_I1_L4_2 -2048,0, 1,0 + KERNEL8x16_I1_L4_2 -2048,0, 2,0 + KERNEL8x16_I1_L4_2 -2048,0, 3,0 + KERNEL8x16_I1_L4_2 -2048,0, 4,0 + KERNEL8x16_I1_L4_2 -2048,0, 5,0 + KERNEL8x16_I1_L4_2 -2048,0, 6,0 + KERNEL8x16_I1_L4_2 -2048,0, 7,0 + KERNEL8x16_I1_L4_2 -2048,0, 8,0 + KERNEL8x16_I1_L4_2 -2048,0, 9,0 + KERNEL8x16_I1_L4_2 -2048,0, 10,0 + KERNEL8x16_I1_L4_2 -2048,0, 11,0 + KERNEL8x16_I1_L4_2 -2048,0, 12,0 + KERNEL8x16_I1_L4_2 -2048,0, 13,0 + KERNEL8x16_I1_L4_2 -2048,0, 14,0 + KERNEL8x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + + END8x16 0, AO, BO, -2048, 0 + + b LSGEMM_L8x16_SUB1 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L8x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L8x16_SUB2_LOOP: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_3 64,32, 7,1 + bdnz LSGEMM_L8x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_3 64,32, 3,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_3 64,32, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_0 + KERNEL8x16_I1_L4_3 64,32, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_0 + KERNEL8x16_I1_L2_3 64,32, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L8x16_SUB2 + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. 
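/* leftover iteration count for the 8x8 tail: the low five bits of K here, or of the trimmed length T11 in the TRMM branch above */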
L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. 
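/* the SUB2 blocks drain the remaining L iterations by testing its bits in turn (4, then 2, then 1) and issuing a matching kernel call for each bit that is set */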
T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. 
T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. 
T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S new file mode 100644 index 000000000..c61f419ac --- /dev/null +++ b/kernel/power/sgemm_macros_power9.S @@ -0,0 +1,5828 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + LOAD8x16 1 +.endm + +.macro LOAD8x16_0 + LOAD8x16 0 +.endm + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + 
xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + 
xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, 
permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + xvmulsp vs50, vs6,vs12 + xvmulsp vs51, vs7,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + xvmulsp vs54, vs6,vs13 + xvmulsp vs55, vs7,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + xvmulsp vs58, vs6,vs14 + xvmulsp vs59, vs7,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + xvmulsp vs62, vs6,vs15 + xvmulsp vs63, vs7,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + +.endif + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore butterfly rank 1 updateto normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv 
vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ +#ifndef TRMMKERNEL + + lxv vs32, 0(T4) + lxv vs33, 16(T4) + lxv vs34, 32(T4) + lxv vs35, 48(T4) + lxv vs36, 0(T5) + lxv vs37, 16(T5) + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) + lxv vs42, 32(T6) + lxv vs43, 48(T6) + lxv vs44, 0(T7) + lxv vs45, 16(T7) + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 
+ xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + stxv vs32, 0(T4) + stxv vs33, 16(T4) + stxv vs34, 32(T4) + stxv vs35, 48(T4) + + stxv vs36, 0(T5) + stxv vs37, 16(T5) + stxv vs38, 32(T5) + stxv vs39, 48(T5) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs40, 0(T6) + stxv vs41, 16(T6) + stxv vs42, 32(T6) + stxv vs43, 48(T6) + stxv vs44, 0(T7) + stxv vs45, 16(T7) + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 
0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, 
DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, 
DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, 
vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + 
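
As a scalar reference for what the vectorized mr x nr micro-tile macros in this file compute (8x16, 8x8, 8x4, ...), the following minimal C sketch may help; it is illustrative only, the function and parameter names are assumptions rather than code from this patch, and it presumes the packed A/B panel layout produced by the OpenBLAS copy routines together with a column-major C tile. The overwrite_c flag mirrors the TRMMKERNEL branches, which build the tile with xvmulsp (C = alpha*A*B) instead of accumulating with xvmaddasp (C += alpha*A*B).

    /* Scalar model of one mr x nr SGEMM micro-tile (illustrative sketch only).
     * a: packed A panel, k slices of mr floats; b: packed B panel, k slices of nr floats;
     * c: column-major output tile with leading dimension ldc. */
    static void sgemm_tile_ref(int mr, int nr, int k,
                               const float *a, const float *b,
                               float alpha, float *c, int ldc,
                               int overwrite_c)
    {
        for (int j = 0; j < nr; j++) {
            for (int i = 0; i < mr; i++) {
                float acc = 0.0f;
                for (int l = 0; l < k; l++)
                    acc += a[l * mr + i] * b[l * nr + j];   /* dot product along K */
                if (overwrite_c)
                    c[j * ldc + i] = alpha * acc;           /* TRMMKERNEL path (xvmulsp)    */
                else
                    c[j * ldc + i] += alpha * acc;          /* GEMM path (xvmaddasp into C) */
            }
        }
    }

In the assembly, the DISPn(ind,disp) macros defined at the top of this file play the role of the a[l*mr + i] / b[l*nr + j] indexing: with unit_size = 4 they expand to byte offsets ind*4*n + disp into the packed panels, so in the 8x16 kernel unrolled by four k-steps DISP64 walks 64-float slices of the A panel while DISP32 walks 32-float slices of the B panel.
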
+.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp 
vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + 
lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + 
xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** 
+* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, 
DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv 
vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro 
Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 
First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + 
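+/**********************************************************************************************
+* Conventions shared by the tile macros in this file (the DISPn addressing helpers are
+* assumed to expand to Index*n*4 + disp, i.e. the byte displacement of the Index-th
+* unrolled step; their definitions are not repeated here):
+*   Zero<N>x<M>            - clear the VSX accumulators of one MxN tile of C
+*   LOAD<N>x<M>            - load the first A/B vectors and build permuted copies of B
+*                            (xxperm/xxpermdi) so each accumulator pairs A with a different
+*                            rotation of the B values; the smaller N=2/N=1 kernels use
+*                            xxspltw broadcasts of B instead
+*   KERNEL<N>x<M>_L1_L4_I  - FMA body unrolled over four K iterations (_L1_L2_I covers two);
+*                            Index selects the unrolled step, IsLast advances the A/B
+*                            pointers, Complete skips the loads belonging to the next step
+*   END<N>x<M>             - fold the last loaded vectors into the accumulators
+*   SAVE<N>x<M>            - rearrange the accumulators into C order, scale by alpha_r and
+*                            either overwrite C (TRMMKERNEL) or accumulate into the loaded C
+**********************************************************************************************/
+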
+/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, 
\AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp 
vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert 
alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, 
vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, 
DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, 
DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp 
vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 
2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 
16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + 
xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r +#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + 
xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi 
\AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub 
\TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/param.h b/param.h index 938a82a9e..d59cb1656 100644 --- a/param.h +++ b/param.h @@ -2248,12 +2248,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 1280 +#define SGEMM_DEFAULT_P 640 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 320 -#define SGEMM_DEFAULT_Q 640 +#define SGEMM_DEFAULT_Q 1408 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640 From 9763f872fcb841a00926f31c801bfd007a5337b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:18:26 +0200 Subject: [PATCH 07/28] Update Changelog with changes from 0.3.6 --- Changelog.txt | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 49b26873a..8df35d5c3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,82 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.6 +29-Apr-2019 + +common: + * the build tools now check that a given cpu TARGET is actually valid + * the build-time check of system features (c_check) has been made + less dependent on particular perl features (this should mainly + benefit building on Windows) + * several problem with the ReLAPACK integration were fixed, + including INTERFACE64 support and building a shared library + * building with CMAKE on BSD systems was improved + * a non-absolute SUM function was added based on the + existing optimized code for ASUM + * CBLAS interfaces to the IxMIN and IxMAX functions were added + * a name clash between LAPACKE and BOOST headers was resolved + * CMAKE builds with OpenMP failed to include the appropriate getrf_parallel + kernels + * a crash on thread (key) deletion with the USE_TLS=1 memory management + option was fixed + * restored several earlier fixes, in particular for OpenMP performance, + building on BSD, and calling fork on CYGWIN, which had inadvertently + been dropped in the 0.3.3 rewrite of the memory management code. + +x86_64: + * the AVX512 DGEMM kernel has been disabled again due to unsolved problems + * building with old versions of MSVC was fixed + * it is now possible to build a static library on Windows with CMAKE + * accessing environment variables on CYGWIN at run time was fixed + * the CMAKE build system now recognizes 32bit userspace on 64bit hardware + * Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected + * building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported + with CMAKE as well + * building for DYNAMIC_ARCH with GENERIC as the default target is now supported + * a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed + * assembly bugs involving undeclared modification of input operands were fixed + in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem, + Sandybridge, Haswell, Bulldozer and Piledriver. 
These would typically cause + test failures or segfaults when compiled with recent versions of gcc from 8 onward. + * a similar bug was fixed in the blas_quickdivide code used to split workloads + in most functions + * a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX + * fixed building on SkylakeX systems when either the compiler or the (emulated) operating + environment does not support AVX512 + * improved GEMM performance on ZEN targets + +x86: + * build failures caused by the recently added checks for AVX512 were fixed + * an inline assembly bug involving undeclared modification of an input argument was + fixed in the blas_quickdivide code used to split workloads in most functions + * a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX + +MIPS32: + * a bug in the IMIN implementation made it return the result of IMAX + +POWER: + * single precision BLAS1/2 functions have received optimized POWER8 kernels + * POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel + * building on PPC970 systems under OSX Leopard or Tiger is now supported + * out-of-bounds memory accesses in the gemm_beta microkernels were fixed + * building a shared library on AIX is now supported for POWER6 + * DYNAMIC_ARCH support has been added for POWER6 and newer + +ARMv7: + * corrected xDOT behaviour with zero INC_X or INC_Y + * a bug in the IMIN implementation made it return the result of IMAX + +ARMv8: + * added support for HiSilicon TSV110 cpus + * the CMAKE build system now recognizes 32bit userspace on 64bit hardware + * cross-compilation with CMAKE now works again + * a bug in the IMIN implementation made it return the result of IMAX + * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 + +IBM Z: + * optimized microkernels for single precicion BLAS1/2 functions have been added + for both Z13 and Z14 + ==================================================================== Version 0.3.5 31-Dec-2018 From bfeb9c16b0011f4f5f508a6d6df18017ab28f95a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:24:53 +0200 Subject: [PATCH 08/28] Increment version to 0.3.7.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969696179..8900973a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 4f8143b098418487b261653b48b16dc71cc2a259 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:25:32 +0200 Subject: [PATCH 09/28] Increment version to 0.3.7.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 21782a2b9..b46479d03 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.6 +VERSION = 0.3.7.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From daf2fec12db90c02aa74cb13726efd8f9b708312 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Mon, 29 Apr 2019 17:03:56 -0400 Subject: [PATCH 10/28] Misc. 
typo fixes Found via `codespell -q 3 -w -L ith,als,dum,nd,amin,nto,wis,ba -S ./relapack,./kernel,./lapack-netlib` --- Changelog.txt | 14 +++++++------- Makefile.rule | 6 +++--- README.md | 2 +- cmake/kernel.cmake | 2 +- cmake/system.cmake | 2 +- cmake/utils.cmake | 2 +- common_stackalloc.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest/c_cblat1.f | 2 +- ctest/c_dblat1.f | 2 +- ctest/c_sblat1.f | 2 +- ctest/c_zblat1.f | 2 +- driver/others/blas_server.c | 6 +++--- driver/others/blas_server_win32.c | 4 ++-- driver/others/init.c | 2 +- driver/others/memory.c | 2 +- f_check | 2 +- interface/CMakeLists.txt | 2 +- interface/axpy.c | 2 +- interface/zaxpy.c | 2 +- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- test/cblat1.f | 2 +- test/dblat1.f | 2 +- test/sblat1.f | 2 +- test/zblat1.f | 2 +- 37 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..9feacf071 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precicion BLAS1/2 functions have been added + * optimized microkernels for single precision BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputing the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputting the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatiable issues with Clang and Pathscale compilers. + * Fixed compatible issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small imput size & multithreads. + * Work-around #27 the low performance axpy issue with small input size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause zdotu & zdotc failures. Instead, work-around it. 
(Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) diff --git a/Makefile.rule b/Makefile.rule index b46479d03..17815096e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -181,17 +181,17 @@ NO_AFFINITY = 1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this mumber by THREAD_TIMEOUT +# system). Also you can control this number by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 -# Using special device driver for mapping physically contigous memory +# Using special device driver for mapping physically contiguous memory # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 -# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute +# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute # with single thread. (Actually in recent versions this is a factor proportional to the # number of floating point operations necessary for the given problem size, no longer # an individual dimension). You can use this setting to avoid the overhead of multi- diff --git a/README.md b/README.md index 26055c745..76a65b74b 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` #### IBM zEnterprise System diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0ed09e776..9b238f004 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,7 +1,7 @@ # helper functions for the kernel CMakeLists.txt -# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. +# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. macro(SetDefaultL1) set(SAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7fda2adb9..d0f560872 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -283,7 +283,7 @@ endif () set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") -# TODO: nead to convert these Makefiles +# TODO: need to convert these Makefiles # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 28ef65f47..fd93f8a70 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () -# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition +# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition # @param sources_in the source files to build from # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. 
diff --git a/common_stackalloc.h b/common_stackalloc.h index ec0fa1611..d3d54669c 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * SIZE must be carefully chosen to be: * - as small as possible to maximize the number of stack allocation * - large enough to support all architectures and kernel - * Chosing a too small SIZE will lead to a stack smashing. + * Choosing a SIZE too small will lead to a stack smashing. */ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ /* make it volatile because some function (ex: dgemv_n.S) */ \ diff --git a/common_x86.h b/common_x86.h index 3fdffe2a8..99adc9f5b 100644 --- a/common_x86.h +++ b/common_x86.h @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 718a81050..f59ff6627 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index c741ce506..1a123d74d 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index c570a9140..4a71b4dcf 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 773787d6f..89902f12d 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index 03753e782..cd0c8541d 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
* diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e5db1804f..6f4e20610 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* jobs is queued. */ -/* We need this grobal for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; /* Local Variables */ @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); #ifdef MONITOR -/* Monitor is a function to see thread's status for every seconds. */ -/* Usually it turns off and it's for debugging. */ +/* Monitor is a function to see thread's status for every second. */ +/* Usually it turns off and it's for debugging. */ static pthread_t monitor_thread; static int main_status[MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 0b38ee365..bace54a23 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,7 +50,7 @@ /* This is a thread implementation for Win32 lazy implementation */ -/* Thread server common infomation */ +/* Thread server common information */ typedef struct{ CRITICAL_SECTION lock; HANDLE filled; @@ -61,7 +61,7 @@ typedef struct{ } blas_pool_t; -/* We need this global for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; /* Local Variables */ diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..0aad9c407 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..3fe31168d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); } #endif diff --git a/f_check b/f_check index 34caa00be..b05db85bd 100644 --- a/f_check +++ b/f_check @@ -125,7 +125,7 @@ if ($compiler eq "") { $openmp = "-openmp"; } - # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. 
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; if ($data =~ / zho_ge__/) { $need2bu = 1; diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index f76d5c13f..5ea39f864 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES axpby.c ) -# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c diff --git a/interface/axpy.c b/interface/axpy.c index 9032946d2..eaa19f4df 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/interface/zaxpy.c b/interface/zaxpy.c index dbd559628..da3b48ead 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ff3c5268d..ada701d70 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index 340234270..ffc4766d2 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index f9d3b445a..9cd1d17ad 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index da340774e..621489085 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index e8f6eb412..492f9fd46 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 0619d3eca..79b2eb806 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index 353e63ee8..f21e5aa8b 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index 1e93b843a..d97a695f5 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 249aff275..7614dcd32 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index 8df5609ad..c8487cf7c 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 7e52ef74e..5dc03bac9 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 9e4f85380..5f52622e2 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/test/cblat1.f b/test/cblat1.f index a4c996fda..d6b53d105 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/dblat1.f b/test/dblat1.f index f3255fef4..28af121cd 100644 --- a/test/dblat1.f +++ b/test/dblat1.f @@ -991,7 +991,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/sblat1.f b/test/sblat1.f index a5c1c6af6..fe05bbe87 100644 --- a/test/sblat1.f +++ b/test/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/zblat1.f b/test/zblat1.f index e2415e1c4..8b4b8d21e 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
* From b43c8382c885551b0f230c8493e79bf04d94e366 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 May 2019 10:46:46 +0200 Subject: [PATCH 11/28] Correct argument of CPU_ISSET for glibc <2.5 fixes #2104 --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..db14cde02 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -229,7 +229,7 @@ int get_num_procs(void) { n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i Date: Wed, 1 May 2019 19:36:22 +0000 Subject: [PATCH 12/28] conflict resolve --- kernel/power/KERNEL.POWER9 | 10 +++++----- kernel/power/icamax.c | 2 +- kernel/power/icamin.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 6d5cf9068..0e0d62393 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -12,11 +12,11 @@ SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power9.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index 06fc5d8ad..bd74d20e5 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; #if defined(USE_MASK_PERMUTATIONS) register __vector unsigned int static_index0 = {0,1,2,3}; #else diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 36432c993..336766245 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; register __vector unsigned int static_index0 = {0,1,2,3}; register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} From 858e609e1feba715065a65034eef02c9516aa107 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:01:29 -0400 Subject: [PATCH 13/28] Revert reference/ fixes --- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ada701d70..ff3c5268d 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index ffc4766d2..340234270 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index 9cd1d17ad..f9d3b445a 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index 621489085..da340774e 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index 492f9fd46..e8f6eb412 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 79b2eb806..0619d3eca 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index f21e5aa8b..353e63ee8 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index d97a695f5..1e93b843a 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 7614dcd32..249aff275 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index c8487cf7c..8df5609ad 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 5dc03bac9..7e52ef74e 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 5f52622e2..9e4f85380 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of From b46875b76b8d4ebbc320547c20f7f4486fe52563 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:43:17 -0400 Subject: [PATCH 14/28] Revert Changelog.txt typos --- Changelog.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 9feacf071..8df35d5c3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precision BLAS1/2 functions have been added + * optimized microkernels for single precicion BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputting the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputing the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatible issues with Clang and Pathscale compilers. + * Fixed compatiable issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small input size & multithreads. 
+ * Work-around #27 the low performance axpy issue with small imput size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) From 7ed8431527eb00f161de4dd309fd4d2b6c885b0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 May 2019 22:54:41 +0200 Subject: [PATCH 15/28] Disable the SkyLakeX DGEMMITCOPY kernel as well as a stopgap measure for https://github.com/numpy/numpy/issues/13401 as mentioned in #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 5d0a300b5..3c678904d 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,7 +10,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c DGEMMINCOPY = dgemm_ncopy_8_skylakex.c -DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From b1561ecc6864428baa4f1336d47d23729b9636f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 May 2019 15:52:01 +0200 Subject: [PATCH 16/28] Disable DGEMMINCOPY as well for now #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3c678904d..d61c51628 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -9,7 +9,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c #DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From a6a8cc2b7fa30f46fdaa4fb6e50c19da8c11e335 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 May 2019 13:34:52 +0200 Subject: [PATCH 17/28] Fix errors in cpu enumeration with glibc 2.6 for #2114 --- driver/others/init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..a29dce971 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; @@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { + +#if defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 7) cpu_set_t *cpusetp; +#else + cpu_set_t cpuset; +#endif +#endif int nums; int ret; @@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) { } CPU_FREE(cpusetp); #else - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); if (ret!=0) { common->num_procs = nums; } else { @@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) { int i; int n = 0; for (i=0;inum_procs = n; } #else - common->num_procs = 
CPU_COUNT(sizeof(cpu_set_t),cpusetp); + common->num_procs = CPU_COUNT(&cpuset); } #endif From c516209581a77790b8d67d6dcd0c3f95fe713643 Mon Sep 17 00:00:00 2001 From: Diazonium Date: Tue, 7 May 2019 14:55:20 +0200 Subject: [PATCH 18/28] Change two http links to https Closes #2109 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76a65b74b..620e393f1 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documentation on the OpenBLAS wiki pages: . +Please read the documentation on the OpenBLAS wiki pages: . ## Binary Packages @@ -22,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source -Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code using Git from https://github.com/xianyi/OpenBLAS.git. ### Dependencies From 7d1b468d9d83789d25eb6996afb5e358ee861f1d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 8 May 2019 09:58:01 +0800 Subject: [PATCH 19/28] Set up CI with Azure Pipelines [skip ci] --- azure-pipelines.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..aa912913d --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,19 @@ +# Starter pipeline +# Start with a minimal pipeline that you can customize to build and deploy your code. +# Add steps that build, run tests, deploy, and more: +# https://aka.ms/yaml + +trigger: +- master + +pool: + vmImage: 'ubuntu-latest' + +steps: +- script: echo Hello, world! + displayName: 'Run a one-line script' + +- script: | + echo Add other tasks to build, test, and deploy your project. + echo See https://aka.ms/yaml + displayName: 'Run a multi-line script' From e47b63466b26dab9618443fd5754885bea653845 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Tue, 7 May 2019 16:06:42 -0700 Subject: [PATCH 20/28] TST: add native POWER8 to CI * add native POWER8 testing to Travis CI matrix with ppc64le os entry --- .travis.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.travis.yml b/.travis.yml index eee7674fe..00a2509f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,15 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 70cea0b96b70330ae6ef80b954e708d6acd86911 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 12:20:00 +0200 Subject: [PATCH 21/28] Update link to IBM MASS library, update cpu support status --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 620e393f1..68a121498 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,7 @@ A debug version can be built using `make DEBUG=1`. 
### Compile with MASS support on Power CPU (optional) -The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library -consists of a set of mathematical functions for C, C++, and Fortran applications that are -are tuned for optimum performance on POWER architectures. +The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. The library can be installed as shown: @@ -115,6 +113,7 @@ Please read `GotoBLAS_01Readme.txt`. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. +- **AMD ZEN**: Uses Haswell codes with some optimizations. #### MIPS64 @@ -133,11 +132,13 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` +- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. #### IBM zEnterprise System - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) +- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) ### Supported OS From 3a49e8c05aa24bba832e5e05bd8888fbee039919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:52:22 +0200 Subject: [PATCH 22/28] first try migrating one of the arm builds from travis --- azure-pipelines.yml | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index aa912913d..87b4de3f0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,6 +14,26 @@ steps: displayName: 'Run a one-line script' - script: | - echo Add other tasks to build, test, and deploy your project. - echo See https://aka.ms/yaml - displayName: 'Run a multi-line script' + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ + condition: not(startsWith(variables['CONFIG'], 'linux_64')) + displayName: Configure binfmt_misc + +- script: | + echo "FROM openblas/alpine:arm32 + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV6 \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . + +#- script: | +# echo Add other tasks to build, test, and deploy your project. 
+# echo See https://aka.ms/yaml +# displayName: 'Run a multi-line script' From 5cf434167ab9622c6788e4fdc9b418ab7bf96e61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:58:59 +0200 Subject: [PATCH 23/28] fix tabbing in azure commands --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 87b4de3f0..3b277073a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,10 +14,10 @@ steps: displayName: 'Run a one-line script' - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: Configure binfmt_misc + displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - + displayname: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. # echo See https://aka.ms/yaml From aa4c41bad26bbb6d550ddad3141063c2260b7afd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:12:02 +0200 Subject: [PATCH 24/28] Update azure-pipelines.yml take out offending lines (although stolen from https://github.com/conda-forge/opencv-feedstock azure-pipelines fiie) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3b277073a..d7e6cdc9b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -15,9 +15,9 @@ steps: - script: | docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ - condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: 'Configure binfmt_misc' +# ls /proc/sys/fs/binfmt_misc/ +# condition: not(startsWith(variables['CONFIG'], 'linux_64')) +# displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 From 16fd8e3dbe510802860f1981321bf9cd70676de4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:14:22 +0200 Subject: [PATCH 25/28] Update azure-pipelines.yml --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d7e6cdc9b..12ea40b61 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayname: 'Run ARMV6 docker build' + displayName: 'Run ARMV6 docker build' + #- script: | # echo Add other tasks to build, test, and deploy your project. # echo See https://aka.ms/yaml From a598ab1d32c1d5fcf9b9eb0c503a24db13757bc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:23:54 +0200 Subject: [PATCH 26/28] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 12ea40b61..2b092c256 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayName: 'Run ARMV6 docker build' +# displayName: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. 
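Several of the patches above (23 through 26) keep adjusting the same detail of the ARMV6 build step: in Azure Pipelines YAML, `displayName` has to sit at the same indentation as `script` inside the step's list item, with the block-scalar body indented one level deeper. As a reference point, here is a minimal sketch of the step this part of the series is converging on (the final patch below restores `displayName` at exactly this level). The image name `openblas/alpine:arm32` and the build commands are taken verbatim from the diffs above; the indentation and the shell comments are only illustrative of standard step syntax, not something these patches themselves establish.

    steps:
    - script: |
        # write a throwaway Dockerfile that configures and builds OpenBLAS for
        # TARGET=ARMV6 inside the arm32 Alpine image, then build that image
        echo "FROM openblas/alpine:arm32
        COPY . /tmp/openblas
        RUN mkdir /tmp/openblas/build && \
            cd /tmp/openblas/build && \
            CC=gcc cmake -D DYNAMIC_ARCH=OFF \
                         -D TARGET=ARMV6 \
                         -D BUILD_SHARED_LIBS=ON \
                         -D BUILD_WITHOUT_LAPACK=ON \
                         -D BUILD_WITHOUT_CBLAS=ON \
                         -D CMAKE_BUILD_TYPE=Release ../ && \
            cmake --build ." > Dockerfile
        docker build .
      displayName: 'Run ARMV6 docker build'   # aligned with 'script', not with the list dash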
From dd77a3f0e27dee0c15b6e1da3649aba6723631ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:25:43 +0200 Subject: [PATCH 27/28] Update azure-pipelines.yml --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2b092c256..e25f11cb1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,6 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . + + # displayName: 'Run ARMV6 docker build' #- script: | From ad20ceaa680e555e6f4e5e6d199f4c158ef1b6df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 19:07:58 +0200 Subject: [PATCH 28/28] Update azure-pipelines.yml --- azure-pipelines.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e25f11cb1..0b1ba16fd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,14 +13,14 @@ steps: - script: echo Hello, world! displayName: 'Run a one-line script' -- script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset +#- script: | +# docker run --rm --privileged multiarch/qemu-user-static:register --reset # ls /proc/sys/fs/binfmt_misc/ # condition: not(startsWith(variables['CONFIG'], 'linux_64')) # displayName: 'Configure binfmt_misc' - script: | - echo "FROM openblas/alpine:arm32 + echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ @@ -31,10 +31,8 @@ steps: -D BUILD_WITHOUT_CBLAS=ON \ -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile - docker build . - - -# displayName: 'Run ARMV6 docker build' + docker build . + displayName: Run ARMV6 docker build #- script: | # echo Add other tasks to build, test, and deploy your project.
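The "Configure binfmt_misc" step that ends up commented out by the last patches is what normally lets the x86_64 `ubuntu-latest` agent run an arm32 container in the first place: it registers the qemu user-mode emulators with the kernel's binfmt_misc handler so that the ARM binaries inside `openblas/alpine:arm32` can be executed transparently. Re-enabled, the step would look roughly as below; the two commands are verbatim from patch 22, while the comments and the expectation of a `qemu-arm` entry appearing are added illustration rather than something this series verifies.

    - script: |
        # register qemu user-mode emulators with binfmt_misc so the x86_64 agent
        # can transparently run the arm32 binaries in openblas/alpine:arm32
        docker run --rm --privileged multiarch/qemu-user-static:register --reset
        ls /proc/sys/fs/binfmt_misc/   # a qemu-arm entry should now be listed
      displayName: 'Configure binfmt_misc'

Without it, the `docker build .` of the ARMV6 image depends on the agent already having qemu/binfmt configured, which is presumably why this step keeps being toggled while the pipeline is being debugged.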