From 0f105dd8a5a597b2f468f774a52da226581efbdc Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Sat, 13 Apr 2019 13:56:19 +0000 Subject: [PATCH 001/127] sgemm/strmm --- CONTRIBUTORS.md | 5 +- kernel/power/KERNEL.POWER9 | 6 +- kernel/power/sgemm_kernel_power9.S | 286 ++ kernel/power/sgemm_logic_power9.S | 2133 ++++++++++ kernel/power/sgemm_macros_power9.S | 5828 ++++++++++++++++++++++++++++ param.h | 4 +- 6 files changed, 8256 insertions(+), 6 deletions(-) create mode 100644 kernel/power/sgemm_kernel_power9.S create mode 100644 kernel/power/sgemm_logic_power9.S create mode 100644 kernel/power/sgemm_macros_power9.S diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 08f8cc69d..3859a9c19 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -167,4 +167,7 @@ In chronological order: * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 - + * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes + * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes + * [2019-03-14] power9 dgemm/dtrmm kernel + * [2019-04-29] power9 sgemm/strmm kernel diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 86a931971..6d5cf9068 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -3,16 +3,16 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = strmm_kernel_16x8_power8.S +STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S new file mode 100644 index 000000000..a44659468 --- /dev/null +++ b/kernel/power/sgemm_kernel_power9.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) + +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + +/* cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 +*/ + + + /*alpha is stored in f1. 
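xscvdpspn rounds the double-precision value to single precision and places it in word element 0 of the target VSX register; xxspltw then broadcasts that word into all four lanes of alpha_r, so a single xvmulsp against alpha_r scales four C elements at once. In short: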
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + ori T2, T2, perm_const2@higher + rldicr T2, T2, 32, 31 + oris T2, T2, perm_const2@h + ori T2, T2, perm_const2@l + + lis T1, perm_const1@highest + ori T1, T1, perm_const1@higher + rldicr T1, T1, 32, 31 + oris T1, T1, perm_const1@h + ori T1, T1, perm_const1@l + + mtvsrdd permute_mask,T2,T1 + + lis T2, save_permute_12@highest + ori T2, T2, save_permute_12@higher + rldicr T2, T2, 32, 31 + oris T2, T2, save_permute_12@h + ori T2, T2, save_permute_12@l + + lis T1, save_permute_11@highest + ori T1, T1, save_permute_11@higher + rldicr T1, T1, 32, 31 + oris T1, T1, save_permute_11@h + ori T1, T1, save_permute_11@l + + mtvsrdd save_permute_1,T2,T1 + + lis T2, save_permute_22@highest + ori T2, T2, save_permute_22@higher + rldicr T2, T2, 32, 31 + oris T2, T2, save_permute_22@h + ori T2, T2, save_permute_22@l + + lis T1, save_permute_21@highest + ori T1, T1, save_permute_21@higher + rldicr T1, T1, 32, 31 + oris T1, T1, save_permute_21@h + ori T1, T1, save_permute_21@l + + mtvsrdd save_permute_2,T2,T1 + +#include "sgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S new file mode 100644 index 000000000..300e30470 --- /dev/null +++ b/kernel/power/sgemm_logic_power9.S @@ -0,0 +1,2133 @@ +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + + MY_ALIGN +LSGEMM_L8x16_LOOP_START: + + LOAD8x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=32 + addi AO,AO,2112 + addi BO,BO,32 + + mtctr L + + MY_ALIGN + +LSGEMM_L8x16_LOOP: + + KERNEL8x16_I1_L4_2 -2048,0, 0,0 + KERNEL8x16_I1_L4_2 -2048,0, 1,0 + KERNEL8x16_I1_L4_2 -2048,0, 2,0 + KERNEL8x16_I1_L4_2 -2048,0, 3,0 + KERNEL8x16_I1_L4_2 -2048,0, 4,0 + KERNEL8x16_I1_L4_2 -2048,0, 5,0 + KERNEL8x16_I1_L4_2 -2048,0, 6,0 + KERNEL8x16_I1_L4_2 -2048,0, 7,0 + KERNEL8x16_I1_L4_2 -2048,0, 8,0 + KERNEL8x16_I1_L4_2 -2048,0, 9,0 + KERNEL8x16_I1_L4_2 -2048,0, 10,0 + KERNEL8x16_I1_L4_2 -2048,0, 11,0 + KERNEL8x16_I1_L4_2 -2048,0, 12,0 + KERNEL8x16_I1_L4_2 -2048,0, 13,0 + KERNEL8x16_I1_L4_2 -2048,0, 14,0 + KERNEL8x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + + END8x16 0, AO, BO, -2048, 0 + + b LSGEMM_L8x16_SUB1 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L8x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L8x16_SUB2_LOOP: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_3 64,32, 7,1 + bdnz LSGEMM_L8x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_3 64,32, 3,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_3 64,32, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_0 + KERNEL8x16_I1_L4_3 64,32, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_0 + KERNEL8x16_I1_L2_3 64,32, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L8x16_SUB2 + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. 
L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. 
T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. 
T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. 
T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S new file mode 100644 index 000000000..c61f419ac --- /dev/null +++ b/kernel/power/sgemm_macros_power9.S @@ -0,0 +1,5828 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + LOAD8x16 1 +.endm + +.macro LOAD8x16_0 + LOAD8x16 0 +.endm + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + 
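+	/* xxlxor of a register with itself is the usual dependency-free idiom for zeroing a VSX accumulator */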
xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + 
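+	/* remaining FMAs of the fourth unrolled step: A tile vs4-vs7 against the replicated B lanes vs8-vs15 */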
xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, 
permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + xvmulsp vs50, vs6,vs12 + xvmulsp vs51, vs7,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + xvmulsp vs54, vs6,vs13 + xvmulsp vs55, vs7,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + xvmulsp vs58, vs6,vs14 + xvmulsp vs59, vs7,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + xvmulsp vs62, vs6,vs15 + xvmulsp vs63, vs7,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + +.endif + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore butterfly rank 1 updateto normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv 
vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ +#ifndef TRMMKERNEL + + lxv vs32, 0(T4) + lxv vs33, 16(T4) + lxv vs34, 32(T4) + lxv vs35, 48(T4) + lxv vs36, 0(T5) + lxv vs37, 16(T5) + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) + lxv vs42, 32(T6) + lxv vs43, 48(T6) + lxv vs44, 0(T7) + lxv vs45, 16(T7) + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 
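+	/* as for the first 64-byte block: the save_permute_1/save_permute_2 pair
+	   recombines the merged halves, finishing the word transpose of the
+	   vs48..vs63 accumulators before they are scaled and stored to T4..T7 */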
+ xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + stxv vs32, 0(T4) + stxv vs33, 16(T4) + stxv vs34, 32(T4) + stxv vs35, 48(T4) + + stxv vs36, 0(T5) + stxv vs37, 16(T5) + stxv vs38, 32(T5) + stxv vs39, 48(T5) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs40, 0(T6) + stxv vs41, 16(T6) + stxv vs42, 32(T6) + stxv vs43, 48(T6) + stxv vs44, 0(T7) + stxv vs45, 16(T7) + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 
0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, 
DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, 
DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, 
vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + 
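+/* N=8, M=4: one 4-float column strip of A meets eight B columns per k step.
+   Rather than splatting each A element, the kernel XOR-rotates the A vector
+   (xxpermdi swaps the 64-bit halves, xxperm with permute_mask swaps the words
+   inside each half) and multiplies it element-wise with whole B vectors, so
+   element i of accumulator r holds sum over k of a[i^rot] * b[i].  A rough C
+   sketch of one k step of END8x4 above (illustrative only; the array names
+   are ours, not part of the kernel):
+
+       int rot[4] = {0, 2, 1, 3};                    // vs0..vs3 rotations of A
+       for (int r = 0; r < 4; r++)
+           for (int i = 0; i < 4; i++) {
+               acc_lo[r][i] += a[i ^ rot[r]] * b[i];      // vs32..vs35, cols 0..3
+               acc_hi[r][i] += a[i ^ rot[r]] * b[4 + i];  // vs48..vs51, cols 4..7
+           }
+
+   SAVE8x4 then undoes the rotation with the xxmrglw/xxmrghw/xxmrg{h,l}d
+   merges before applying alpha. */
+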
+.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp 
vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + 
lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + 
xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** 
+* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, 
DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv 
vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro 
Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 
First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + 
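A note for orientation before the narrower tiles below: these 4x8 macros implement the standard GEMM micro-tile update for an M=8, N=4 block of C. The C sketch that follows is an illustration only; it is not part of the patch, and the packed-buffer layout it assumes (8 floats of A and 4 floats of B per k step) is the usual OpenBLAS packing convention. Zero4X8 corresponds to clearing the accumulators, the KERNEL4x8 variants to the k-loop of xvmaddasp fused multiply-adds, and SAVE4x8 to the alpha-scaled write-back (the TRMMKERNEL path overwrites C with xvmulsp instead of accumulating). In the real code the xxperm/xxpermdi shuffles rotate the lanes of B rather than splatting each element, which is why SAVE4x8 needs save_permute_1/save_permute_2 to restore the natural order before storing.

/* reference semantics of the 4x8 tile (hedged sketch, not the kernel itself) */
void ref_kernel_4x8(long k, float alpha, const float *A,
                    const float *B, float *C, long ldc)
{
    float acc[4][8] = {{0.0f}};              /* Zero4X8 */

    for (long l = 0; l < k; l++)             /* KERNEL4x8_* k-loop */
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 8; i++)      /* xvmaddasp into vs32..vs45 */
                acc[j][i] += A[8 * l + i] * B[4 * l + j];

    for (int j = 0; j < 4; j++)              /* SAVE4x8, GEMM path: C += alpha*acc */
        for (int i = 0; i < 8; i++)
            C[j * ldc + i] += alpha * acc[j][i];
}

The same pattern repeats for every tile size that follows (4x4, 4x2, ..., 1x1); only the block shape, the unroll depth of the k-loop, and the lane-shuffle bookkeeping change.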
+/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, 
\AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp 
vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert 
alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, 
vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, 
DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, 
DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp 
vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 
2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 
16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + 
xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r +#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + 
xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi 
\AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub 
\TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/param.h b/param.h index 938a82a9e..d59cb1656 100644 --- a/param.h +++ b/param.h @@ -2248,12 +2248,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 1280 +#define SGEMM_DEFAULT_P 640 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 320 -#define SGEMM_DEFAULT_Q 640 +#define SGEMM_DEFAULT_Q 1408 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640 From bfeb9c16b0011f4f5f508a6d6df18017ab28f95a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:24:53 +0200 Subject: [PATCH 002/127] Increment version to 0.3.7.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969696179..8900973a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 4f8143b098418487b261653b48b16dc71cc2a259 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:25:32 +0200 Subject: [PATCH 003/127] Increment version to 0.3.7.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 21782a2b9..b46479d03 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.6 +VERSION = 0.3.7.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From daf2fec12db90c02aa74cb13726efd8f9b708312 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Mon, 29 Apr 2019 17:03:56 -0400 Subject: [PATCH 004/127] Misc. 
typo fixes Found via `codespell -q 3 -w -L ith,als,dum,nd,amin,nto,wis,ba -S ./relapack,./kernel,./lapack-netlib` --- Changelog.txt | 14 +++++++------- Makefile.rule | 6 +++--- README.md | 2 +- cmake/kernel.cmake | 2 +- cmake/system.cmake | 2 +- cmake/utils.cmake | 2 +- common_stackalloc.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest/c_cblat1.f | 2 +- ctest/c_dblat1.f | 2 +- ctest/c_sblat1.f | 2 +- ctest/c_zblat1.f | 2 +- driver/others/blas_server.c | 6 +++--- driver/others/blas_server_win32.c | 4 ++-- driver/others/init.c | 2 +- driver/others/memory.c | 2 +- f_check | 2 +- interface/CMakeLists.txt | 2 +- interface/axpy.c | 2 +- interface/zaxpy.c | 2 +- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- test/cblat1.f | 2 +- test/dblat1.f | 2 +- test/sblat1.f | 2 +- test/zblat1.f | 2 +- 37 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..9feacf071 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precicion BLAS1/2 functions have been added + * optimized microkernels for single precision BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputing the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputting the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatiable issues with Clang and Pathscale compilers. + * Fixed compatible issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small imput size & multithreads. + * Work-around #27 the low performance axpy issue with small input size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause zdotu & zdotc failures. Instead, work-around it. 
(Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) diff --git a/Makefile.rule b/Makefile.rule index b46479d03..17815096e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -181,17 +181,17 @@ NO_AFFINITY = 1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this mumber by THREAD_TIMEOUT +# system). Also you can control this number by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 -# Using special device driver for mapping physically contigous memory +# Using special device driver for mapping physically contiguous memory # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 -# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute +# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute # with single thread. (Actually in recent versions this is a factor proportional to the # number of floating point operations necessary for the given problem size, no longer # an individual dimension). You can use this setting to avoid the overhead of multi- diff --git a/README.md b/README.md index 26055c745..76a65b74b 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` #### IBM zEnterprise System diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0ed09e776..9b238f004 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,7 +1,7 @@ # helper functions for the kernel CMakeLists.txt -# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. +# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. macro(SetDefaultL1) set(SAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7fda2adb9..d0f560872 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -283,7 +283,7 @@ endif () set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") -# TODO: nead to convert these Makefiles +# TODO: need to convert these Makefiles # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 28ef65f47..fd93f8a70 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () -# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition +# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition # @param sources_in the source files to build from # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. 
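The cmake/utils.cmake comment corrected just above describes compiling one generic source file once per BLAS type, passing the exported function name (and the s/d/c/z float character) as a preprocessor definition. A minimal sketch of that pattern, assuming hypothetical FLOAT_T and FUNC_NAME spellings rather than the actual OpenBLAS build glue:

/* scal_generic.c: built once per precision, e.g.
 *   cc -DFLOAT_T=float  -DFUNC_NAME=sscal_k -c scal_generic.c -o sscal_k.o
 *   cc -DFLOAT_T=double -DFUNC_NAME=dscal_k -c scal_generic.c -o dscal_k.o
 * FLOAT_T and FUNC_NAME are illustrative names, not the build system's own. */
#include <stddef.h>

#ifndef FLOAT_T
#define FLOAT_T float            /* assumed default when built standalone */
#endif
#ifndef FUNC_NAME
#define FUNC_NAME sscal_k
#endif

void FUNC_NAME(size_t n, FLOAT_T alpha, FLOAT_T *x)
{
    for (size_t i = 0; i < n; i++)
        x[i] *= alpha;           /* one generic body, several exported symbols */
}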
diff --git a/common_stackalloc.h b/common_stackalloc.h index ec0fa1611..d3d54669c 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * SIZE must be carefully chosen to be: * - as small as possible to maximize the number of stack allocation * - large enough to support all architectures and kernel - * Chosing a too small SIZE will lead to a stack smashing. + * Choosing a SIZE too small will lead to a stack smashing. */ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ /* make it volatile because some function (ex: dgemv_n.S) */ \ diff --git a/common_x86.h b/common_x86.h index 3fdffe2a8..99adc9f5b 100644 --- a/common_x86.h +++ b/common_x86.h @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 718a81050..f59ff6627 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index c741ce506..1a123d74d 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index c570a9140..4a71b4dcf 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 773787d6f..89902f12d 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index 03753e782..cd0c8541d 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
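The figure quoted in the Makefile.rule THREAD_TIMEOUT comment above can be sanity-checked with shell arithmetic, using the comment's own example of n=26 at 3.0 GHz:

    # (1 << 26) cycles divided by 3000 cycles per microsecond
    echo $(( (1 << 26) / 3000 ))    # prints 22369 microseconds, in the ballpark of the quoted "about 25ms"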
* diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e5db1804f..6f4e20610 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* jobs is queued. */ -/* We need this grobal for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; /* Local Variables */ @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); #ifdef MONITOR -/* Monitor is a function to see thread's status for every seconds. */ -/* Usually it turns off and it's for debugging. */ +/* Monitor is a function to see thread's status for every second. */ +/* Usually it turns off and it's for debugging. */ static pthread_t monitor_thread; static int main_status[MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 0b38ee365..bace54a23 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,7 +50,7 @@ /* This is a thread implementation for Win32 lazy implementation */ -/* Thread server common infomation */ +/* Thread server common information */ typedef struct{ CRITICAL_SECTION lock; HANDLE filled; @@ -61,7 +61,7 @@ typedef struct{ } blas_pool_t; -/* We need this global for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; /* Local Variables */ diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..0aad9c407 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..3fe31168d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); } #endif diff --git a/f_check b/f_check index 34caa00be..b05db85bd 100644 --- a/f_check +++ b/f_check @@ -125,7 +125,7 @@ if ($compiler eq "") { $openmp = "-openmp"; } - # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. 
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; if ($data =~ / zho_ge__/) { $need2bu = 1; diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index f76d5c13f..5ea39f864 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES axpby.c ) -# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c diff --git a/interface/axpy.c b/interface/axpy.c index 9032946d2..eaa19f4df 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/interface/zaxpy.c b/interface/zaxpy.c index dbd559628..da3b48ead 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ff3c5268d..ada701d70 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index 340234270..ffc4766d2 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index f9d3b445a..9cd1d17ad 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index da340774e..621489085 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index e8f6eb412..492f9fd46 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 0619d3eca..79b2eb806 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index 353e63ee8..f21e5aa8b 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index 1e93b843a..d97a695f5 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 249aff275..7614dcd32 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index 8df5609ad..c8487cf7c 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 7e52ef74e..5dc03bac9 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 9e4f85380..5f52622e2 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/test/cblat1.f b/test/cblat1.f index a4c996fda..d6b53d105 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/dblat1.f b/test/dblat1.f index f3255fef4..28af121cd 100644 --- a/test/dblat1.f +++ b/test/dblat1.f @@ -991,7 +991,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/sblat1.f b/test/sblat1.f index a5c1c6af6..fe05bbe87 100644 --- a/test/sblat1.f +++ b/test/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/zblat1.f b/test/zblat1.f index e2415e1c4..8b4b8d21e 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
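Since this series is in git format-patch (mbox) form, a patch such as the one above can be applied and spot-checked locally along these lines (the file name is illustrative, not part of the series):

    git am 0004-typo-fixes.patch    # apply one mailbox patch, preserving author and date
    git log -1 --stat               # compare the recorded diffstat with the one above
    git am --abort                  # abandon the session if the patch fails to apply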
* From b43c8382c885551b0f230c8493e79bf04d94e366 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 May 2019 10:46:46 +0200 Subject: [PATCH 005/127] Correct argument of CPU_ISSET for glibc <2.5 fixes #2104 --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..db14cde02 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -229,7 +229,7 @@ int get_num_procs(void) { n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i Date: Wed, 1 May 2019 19:36:22 +0000 Subject: [PATCH 006/127] conflict resolve --- kernel/power/KERNEL.POWER9 | 10 +++++----- kernel/power/icamax.c | 2 +- kernel/power/icamin.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 6d5cf9068..0e0d62393 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -12,11 +12,11 @@ SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power9.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index 06fc5d8ad..bd74d20e5 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; #if defined(USE_MASK_PERMUTATIONS) register __vector unsigned int static_index0 = {0,1,2,3}; #else diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 36432c993..336766245 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; register __vector unsigned int static_index0 = {0,1,2,3}; register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} From 858e609e1feba715065a65034eef02c9516aa107 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:01:29 -0400 Subject: [PATCH 007/127] Revert reference/ fixes --- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ada701d70..ff3c5268d 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index ffc4766d2..340234270 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index 9cd1d17ad..f9d3b445a 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index 621489085..da340774e 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index 492f9fd46..e8f6eb412 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 79b2eb806..0619d3eca 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index f21e5aa8b..353e63ee8 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index d97a695f5..1e93b843a 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 7614dcd32..249aff275 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index c8487cf7c..8df5609ad 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 5dc03bac9..7e52ef74e 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 5f52622e2..9e4f85380 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of From b46875b76b8d4ebbc320547c20f7f4486fe52563 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:43:17 -0400 Subject: [PATCH 008/127] Revert Changelog.txt typos --- Changelog.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 9feacf071..8df35d5c3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precision BLAS1/2 functions have been added + * optimized microkernels for single precicion BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputting the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputing the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatible issues with Clang and Pathscale compilers. + * Fixed compatiable issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small input size & multithreads. 
+ * Work-around #27 the low performance axpy issue with small imput size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) From 7ed8431527eb00f161de4dd309fd4d2b6c885b0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 May 2019 22:54:41 +0200 Subject: [PATCH 009/127] Disable the SkyLakeX DGEMMITCOPY kernel as well as a stopgap measure for https://github.com/numpy/numpy/issues/13401 as mentioned in #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 5d0a300b5..3c678904d 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,7 +10,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c DGEMMINCOPY = dgemm_ncopy_8_skylakex.c -DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From b1561ecc6864428baa4f1336d47d23729b9636f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 May 2019 15:52:01 +0200 Subject: [PATCH 010/127] Disable DGEMMINCOPY as well for now #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3c678904d..d61c51628 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -9,7 +9,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c #DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From 5a9cce2bf6740110b93a534f876072f220d928d1 Mon Sep 17 00:00:00 2001 From: Fabrice Fontaine Date: Sun, 5 May 2019 18:37:28 +0200 Subject: [PATCH 011/127] Makefile.arm: remove -march flags The provided -march flags, especially for ARMv5 and ARMv6 may not necessarily match the needed ones: for ARMv5, it might be armv5, armv5te, armv5t, etc. If the wrong one is used, the incorrect toolchain sysroot can be used in a multilib toolchain. Therefore, let the user building OpenBLAS pass the appropriate -march flag. The other flags, such as -mfpu=vfp or -mfloat-abi=hard are kept, as they are actually required for the build to proceed (OpenBLAS uses VFP instructions, and assume an EABIhf ABI). 
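With this change, choosing the architecture level becomes the builder's job. A minimal sketch of what that might look like, assuming the COMMON_OPT hook documented in Makefile.rule (the armv5te value is purely an example; pick whatever matches the toolchain's sysroot):

    # COMMON_OPT is folded into both CFLAGS and FFLAGS by the build system
    make TARGET=ARMV5 COMMON_OPT="-O2 -march=armv5te"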
[Peter: update for v0.2.20] Signed-off-by: Thomas Petazzoni Signed-off-by: Peter Korsgaard [Retrieved from: https://git.buildroot.net/buildroot/tree/package/openblas/0001-Makefile.arm-remove-march-flags.patch] Signed-off-by: Fabrice Fontaine --- Makefile.arm | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index eedd39b73..b5d80f8e6 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a @@ -9,11 +9,6 @@ endif endif ifeq ($(CORE), ARMV6) -CCOMMON_OPT += -mfpu=vfp -march=armv6 -FCOMMON_OPT += -mfpu=vfp -march=armv6 -endif - -ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -march=armv5 -FCOMMON_OPT += -march=armv5 +CCOMMON_OPT += -mfpu=vfp +FCOMMON_OPT += -mfpu=vfp endif From 3d7debbb280d0e671691469da650a124bedda219 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 7 May 2019 13:15:08 +0300 Subject: [PATCH 012/127] init From a6a8cc2b7fa30f46fdaa4fb6e50c19da8c11e335 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 May 2019 13:34:52 +0200 Subject: [PATCH 013/127] Fix errors in cpu enumeration with glibc 2.6 for #2114 --- driver/others/init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..a29dce971 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; @@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { + +#if defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 7) cpu_set_t *cpusetp; +#else + cpu_set_t cpuset; +#endif +#endif int nums; int ret; @@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) { } CPU_FREE(cpusetp); #else - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); if (ret!=0) { common->num_procs = nums; } else { @@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) { int i; int n = 0; for (i=0;inum_procs = n; } #else - common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); + common->num_procs = CPU_COUNT(&cpuset); } #endif From c516209581a77790b8d67d6dcd0c3f95fe713643 Mon Sep 17 00:00:00 2001 From: Diazonium Date: Tue, 7 May 2019 14:55:20 +0200 Subject: [PATCH 014/127] Change two http links to https Closes #2109 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76a65b74b..620e393f1 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documentation on the OpenBLAS wiki pages: . +Please read the documentation on the OpenBLAS wiki pages: . 
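The __GLIBC_PREREQ guards added in the affinity fix above are needed because cpu_set_t handling changed across glibc releases. When reproducing such issues, the glibc on a given build host can be identified with either of these standard commands:

    getconf GNU_LIBC_VERSION    # prints e.g. "glibc 2.5"
    ldd --version | head -n1    # banner line names the glibc version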
## Binary Packages @@ -22,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source -Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code using Git from https://github.com/xianyi/OpenBLAS.git. ### Dependencies From 575a84398a1569738029594372f9143a6743c52c Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 7 May 2019 23:46:54 +0300 Subject: [PATCH 015/127] remove redundant code #2113 --- lapack/getrf/getrf_parallel.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 591ce4a99..c82defcab 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -279,9 +279,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) #if 1 { - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; @@ -368,9 +365,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if ((current != mypos) && (!is)) { #if 1 - LOCK_COMMAND(&getrf_lock); - jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; @@ -402,9 +396,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { #if 1 - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; From 7d1b468d9d83789d25eb6996afb5e358ee861f1d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 8 May 2019 09:58:01 +0800 Subject: [PATCH 016/127] Set up CI with Azure Pipelines [skip ci] --- azure-pipelines.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..aa912913d --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,19 @@ +# Starter pipeline +# Start with a minimal pipeline that you can customize to build and deploy your code. +# Add steps that build, run tests, deploy, and more: +# https://aka.ms/yaml + +trigger: +- master + +pool: + vmImage: 'ubuntu-latest' + +steps: +- script: echo Hello, world! + displayName: 'Run a one-line script' + +- script: | + echo Add other tasks to build, test, and deploy your project. 
+ echo See https://aka.ms/yaml + displayName: 'Run a multi-line script' From e47b63466b26dab9618443fd5754885bea653845 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Tue, 7 May 2019 16:06:42 -0700 Subject: [PATCH 017/127] TST: add native POWER8 to CI * add native POWER8 testing to Travis CI matrix with ppc64le os entry --- .travis.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.travis.yml b/.travis.yml index eee7674fe..00a2509f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,15 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 70cea0b96b70330ae6ef80b954e708d6acd86911 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 12:20:00 +0200 Subject: [PATCH 018/127] Update link to IBM MASS library, update cpu support status --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 620e393f1..68a121498 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,7 @@ A debug version can be built using `make DEBUG=1`. ### Compile with MASS support on Power CPU (optional) -The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library -consists of a set of mathematical functions for C, C++, and Fortran applications that are -are tuned for optimum performance on POWER architectures. +The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. The library can be installed as shown: @@ -115,6 +113,7 @@ Please read `GotoBLAS_01Readme.txt`. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. +- **AMD ZEN**: Uses Haswell codes with some optimizations. #### MIPS64 @@ -133,11 +132,13 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` +- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. #### IBM zEnterprise System - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) +- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) ### Supported OS From 3a49e8c05aa24bba832e5e05bd8888fbee039919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:52:22 +0200 Subject: [PATCH 019/127] first try migrating one of the arm builds from travis --- azure-pipelines.yml | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index aa912913d..87b4de3f0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,6 +14,26 @@ steps: displayName: 'Run a one-line script' - script: | - echo Add other tasks to build, test, and deploy your project. 
- echo See https://aka.ms/yaml - displayName: 'Run a multi-line script' + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ + condition: not(startsWith(variables['CONFIG'], 'linux_64')) + displayName: Configure binfmt_misc + +- script: | + echo "FROM openblas/alpine:arm32 + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV6 \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . + +#- script: | +# echo Add other tasks to build, test, and deploy your project. +# echo See https://aka.ms/yaml +# displayName: 'Run a multi-line script' From 5cf434167ab9622c6788e4fdc9b418ab7bf96e61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:58:59 +0200 Subject: [PATCH 020/127] fix tabbing in azure commands --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 87b4de3f0..3b277073a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,10 +14,10 @@ steps: displayName: 'Run a one-line script' - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: Configure binfmt_misc + displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - + displayname: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. # echo See https://aka.ms/yaml From aa4c41bad26bbb6d550ddad3141063c2260b7afd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:12:02 +0200 Subject: [PATCH 021/127] Update azure-pipelines.yml take out offending lines (although stolen from https://github.com/conda-forge/opencv-feedstock azure-pipelines fiie) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3b277073a..d7e6cdc9b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -15,9 +15,9 @@ steps: - script: | docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ - condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: 'Configure binfmt_misc' +# ls /proc/sys/fs/binfmt_misc/ +# condition: not(startsWith(variables['CONFIG'], 'linux_64')) +# displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 From 16fd8e3dbe510802860f1981321bf9cd70676de4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:14:22 +0200 Subject: [PATCH 022/127] Update azure-pipelines.yml --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d7e6cdc9b..12ea40b61 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayname: 'Run ARMV6 docker build' + displayName: 'Run ARMV6 docker build' + #- script: | # echo Add other tasks to build, test, and deploy your project. 
# echo See https://aka.ms/yaml From a598ab1d32c1d5fcf9b9eb0c503a24db13757bc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:23:54 +0200 Subject: [PATCH 023/127] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 12ea40b61..2b092c256 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayName: 'Run ARMV6 docker build' +# displayName: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. From dd77a3f0e27dee0c15b6e1da3649aba6723631ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:25:43 +0200 Subject: [PATCH 024/127] Update azure-pipelines.yml --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2b092c256..e25f11cb1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,6 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . + + # displayName: 'Run ARMV6 docker build' #- script: | From ad20ceaa680e555e6f4e5e6d199f4c158ef1b6df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 19:07:58 +0200 Subject: [PATCH 025/127] Update azure-pipelines.yml --- azure-pipelines.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e25f11cb1..0b1ba16fd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,14 +13,14 @@ steps: - script: echo Hello, world! displayName: 'Run a one-line script' -- script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset +#- script: | +# docker run --rm --privileged multiarch/qemu-user-static:register --reset # ls /proc/sys/fs/binfmt_misc/ # condition: not(startsWith(variables['CONFIG'], 'linux_64')) # displayName: 'Configure binfmt_misc' - script: | - echo "FROM openblas/alpine:arm32 + echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ @@ -31,10 +31,8 @@ steps: -D BUILD_WITHOUT_CBLAS=ON \ -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile - docker build . - - -# displayName: 'Run ARMV6 docker build' + docker build . + displayName: Run ARMV6 docker build #- script: | # echo Add other tasks to build, test, and deploy your project. From 53703585aa5ac170cabfe035a32bd0e07e1877c8 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Wed, 8 May 2019 15:14:01 -0700 Subject: [PATCH 026/127] DOC: Add Azure CI status badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 68a121498..14815ff00 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) +[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) + ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. 
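A note on the qemu-user-static step the ARM jobs above depend on: the privileged container registers binfmt_misc handlers on the x86_64 host, so foreign-architecture ELF binaries run transparently under qemu inside the build containers. Registering and then verifying by hand looks like this:

    docker run --rm --privileged multiarch/qemu-user-static:register --reset
    ls /proc/sys/fs/binfmt_misc/            # expect qemu-arm, qemu-aarch64, ... entries
    cat /proc/sys/fs/binfmt_misc/qemu-arm   # shows magic and interpreter for 32-bit ARM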
From 406c7242f49730e45453544b601d717e02ebe07d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 00:47:44 +0200 Subject: [PATCH 027/127] Add ARMV6 build to azure CI setup (#2122) using aytekinar's Alpine image and docker script from the Travis setup [skip ci] --- azure-pipelines.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0b1ba16fd..cef2ef973 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,19 +13,15 @@ steps: - script: echo Hello, world! displayName: 'Run a one-line script' -#- script: | -# docker run --rm --privileged multiarch/qemu-user-static:register --reset -# ls /proc/sys/fs/binfmt_misc/ -# condition: not(startsWith(variables['CONFIG'], 'linux_64')) -# displayName: 'Configure binfmt_misc' - - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV6 \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV6 \ + -D NOFORTRAN=ON \ -D BUILD_SHARED_LIBS=ON \ -D BUILD_WITHOUT_LAPACK=ON \ -D BUILD_WITHOUT_CBLAS=ON \ From 4efbac28ed42b79ac0ba27cfe065d38a3ba5af68 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Wed, 8 May 2019 18:51:59 -0700 Subject: [PATCH 028/127] TST: Azure manylinux1 & clean-up * remove some of the steps & comments from the original Azure yml template * modify the trigger section to use develop since OpenBLAS primarily uses this branch; use the same batching behavior as downstream projects NumPy/ SciPy * remove Travis emulated ARMv6 gcc build because this now happens in Azure * use documented Ubuntu vmImage name for Azure and add in a manylinux1 test run to the matrix [skip appveyor] --- .travis.yml | 8 ++----- azure-pipelines.yml | 57 +++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index 00a2509f9..82e2aaac8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -177,8 +177,8 @@ matrix: dist: trusty sudo: required services: docker - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc - name: "Emulated Build for ARMV6 with gcc" + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + name: "Emulated Build for ARMV6 with clang" before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset script: | echo "FROM openblas/alpine:${IMAGE_ARCH} @@ -193,9 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - name: "Emulated Build for ARMV6 with clang" - <<: *emulated-arm env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc name: "Emulated Build for ARMV8 with gcc" @@ -204,7 +201,6 @@ matrix: name: "Emulated Build for ARMV8 with clang" allow_failures: - - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cef2ef973..cbea6f4a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,21 +1,18 @@ -# Starter pipeline -# Start with a minimal pipeline that you can customize to build and deploy your code. 
-# Add steps that build, run tests, deploy, and more: -# https://aka.ms/yaml - trigger: -- master + # start a new build for every push + batch: False + branches: + include: + - develop -pool: - vmImage: 'ubuntu-latest' - -steps: -- script: echo Hello, world! - displayName: 'Run a one-line script' - -- script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm32 +jobs: +- job: ARMv6_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset + echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ @@ -27,10 +24,24 @@ steps: -D BUILD_WITHOUT_CBLAS=ON \ -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile - docker build . - displayName: Run ARMV6 docker build - -#- script: | -# echo Add other tasks to build, test, and deploy your project. -# echo See https://aka.ms/yaml -# displayName: 'Run a multi-line script' + docker build . + displayName: Run ARMV6 docker build +# manylinux1 is useful to test because the +# standard Docker container uses an old version +# of gcc / glibc +- job: manylinux1_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + echo "FROM quay.io/pypa/manylinux1_x86_64 + COPY . /tmp/openblas + RUN cd /tmp/openblas && \ + COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ + BTYPE='BINARY=64' CC=gcc && \ + make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ + make -C test $COMMON_FLAGS $BTYPE && \ + make -C ctest $COMMON_FLAGS $BTYPE && \ + make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile + docker build . + displayName: Run manylinux1 docker build From a3d4c65d62cf3689fe5840e65a7fcdb64d986435 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 11:52:02 +0200 Subject: [PATCH 029/127] Add NO_AFFINITY to available options on Linux, and set it to ON to match the gmake default. 
Fixes second part of #2114 --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a27c1c0fc..50da721cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6.dev) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -20,9 +20,14 @@ if(MSVC) option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) -option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) -option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) +option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") +option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) +else() +set(NO_AFFINITY 1) +endif() # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using From 9ea30f3788b64b7f42acfaf08e234591aee33e23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 14:42:36 +0200 Subject: [PATCH 030/127] Replace ISMIN and ISAMIN kernels on all x86_64 platforms (#2125) * Mark iamax_sse.S as unsuitable for MIN due to issue #2116 * Use iamax.S rather than iamax_sse.S for ISMIN/ISAMIN on all x86_64 as workaround for #2116 --- kernel/x86_64/KERNEL | 4 +- kernel/x86_64/iamax_sse.S | 106 ++++++++++++++++++++------------------ 2 files changed, 58 insertions(+), 52 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4874711bb..92d121ab2 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S endif ifndef ISAMINKERNEL -ISAMINKERNEL = iamax_sse.S +ISAMINKERNEL = iamax.S endif ifndef IDAMINKERNEL @@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S endif ifndef ISMINKERNEL -ISMINKERNEL = iamax_sse.S +ISMINKERNEL = iamax.S endif ifndef IDMINKERNEL diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index f22e34a1d..d50c1699c 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -36,6 +36,10 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +/* This kernel was found to give wrong results when used for ISMIN/ISAMIN + with increment != 1, although it appears to be correct for corresponding + MAX operations. 
See issue 2116 */ + #define ASSEMBLER #include "common.h" @@ -48,9 +52,11 @@ #define XX %r10 #define MM %r11 +#define MAXPS maxps +#define MAXSS maxss #ifdef USE_MIN -#define maxps minps -#define maxss minss +#define MAXPS minps +#define MAXSS minss #endif #include "l1param.h" @@ -103,7 +109,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 decq M addq $SIZE, X ALIGN_3 @@ -117,7 +123,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm1 + MAXPS %xmm4, %xmm1 subq $2, M addq $2 * SIZE, X ALIGN_3 @@ -137,25 +143,25 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 movaps 8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 movaps 12 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -173,13 +179,13 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -191,7 +197,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -204,7 +210,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $2 * SIZE, X .L18: @@ -215,22 +221,22 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 ALIGN_3 .L20: movq XX, X movq MM, M - maxps %xmm1, %xmm0 - maxps %xmm3, %xmm2 - maxps %xmm2, %xmm0 + MAXPS %xmm1, %xmm0 + MAXPS %xmm3, %xmm2 + MAXPS %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - maxps %xmm1, %xmm0 + MAXPS %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - maxss %xmm1, %xmm0 + MAXSS %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 testq $4, X @@ -427,28 +433,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 movsd 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 movsd 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -467,14 +473,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -488,7 +494,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -501,7 +507,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $2 * SIZE, X .L38: @@ -512,7 +518,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 jmp .L40 ALIGN_4 @@ -520,15 +526,15 @@ movq XX, X movq MM, M - maxps %xmm1, %xmm0 - maxps %xmm3, %xmm2 - maxps %xmm2, %xmm0 + MAXPS %xmm1, %xmm0 + MAXPS %xmm3, %xmm2 + MAXPS %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - maxps %xmm1, %xmm0 + MAXPS %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - maxss %xmm1, %xmm0 + MAXSS %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I @@ 
-687,56 +693,56 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 decq I jg .L81 @@ -754,28 +760,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 ALIGN_3 .L86: @@ -787,14 +793,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 ALIGN_3 .L87: @@ -806,16 +812,16 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 ALIGN_4 .L90: movq XX, X movq MM, M - maxss %xmm1, %xmm0 - maxss %xmm3, %xmm2 - maxss %xmm2, %xmm0 + MAXSS %xmm1, %xmm0 + MAXSS %xmm3, %xmm2 + MAXSS %xmm2, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I From 3cb1c8d210046f4f6e2935fe796af3648387a38e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 16:07:30 +0200 Subject: [PATCH 031/127] Move ARMv8 gcc build from Travis to Azure --- azure-pipelines.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cbea6f4a7..4673d07fe 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,6 +26,26 @@ jobs: cmake --build ." > Dockerfile docker build . displayName: Run ARMV6 docker build +- job: ARMv8_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset + echo "FROM openblas/alpine:arm64 + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV8 \ + -D NOFORTRAN=ON \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . 
+ displayName: Run ARMV8 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From 999a04f101250a189c92919277db6cbc50a584ff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 16:08:23 +0200 Subject: [PATCH 032/127] Move ARMv8 gcc build from Travis to Azure --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 82e2aaac8..eb74ded37 100644 --- a/.travis.yml +++ b/.travis.yml @@ -202,7 +202,6 @@ matrix: allow_failures: - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang # whitelist From 43068288e9fc035cd9ebc7254de7a5f0a3600090 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 22:37:06 +0200 Subject: [PATCH 033/127] Update .travis.yml --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index eb74ded37..b2827997c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -193,9 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - name: "Emulated Build for ARMV8 with gcc" - <<: *emulated-arm env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang name: "Emulated Build for ARMV8 with clang" From d86f0b9e74130ab659062bca40badc1dc36649f0 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:35:07 -0500 Subject: [PATCH 034/127] Test drone CI --- .drone.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .drone.yml diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 000000000..b2300b81d --- /dev/null +++ b/.drone.yml @@ -0,0 +1,19 @@ +--- +kind: pipeline +name: arm64_gcc + +platform: + os: linux + arch: arm64 + +steps: +- name: Build + image: centos:7 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS" From 58829c098841d2da28defa96538a7a2f9d3e0f21 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:39:51 -0500 Subject: [PATCH 035/127] install make --- .drone.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index b2300b81d..75868e919 100644 --- a/.drone.yml +++ b/.drone.yml @@ -7,12 +7,13 @@ platform: arch: arm64 steps: -- name: Build +- name: Build and Test image: centos:7 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - sudo yum -y install make - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From ff807473bb6e0faf8e7767c18b5cfae1318e0aaa Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:40:23 -0500 Subject: [PATCH 036/127] remove sudo --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 75868e919..da9520975 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - sudo yum -y install make + - yum -y install make - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 21acf03e9a2b21e39fa6e81899f100084de0ba93 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:42:16 -0500 
Subject: [PATCH 037/127] Install gcc --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index da9520975..c4f216ed6 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make + - yum -y install make gcc - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 15f925fe9a0ca823352fd252cad2da95c810cec4 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:44:15 -0500 Subject: [PATCH 038/127] Install perl --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index c4f216ed6..765c2b02c 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc + - yum -y install make gcc perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From a0aaf308ed682d58962f1dd6f568647e97572596 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:47:49 -0500 Subject: [PATCH 039/127] Install gfortran and add a clang job --- .drone.yml | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 765c2b02c..3b1515c33 100644 --- a/.drone.yml +++ b/.drone.yml @@ -1,6 +1,6 @@ --- kind: pipeline -name: arm64_gcc +name: arm64_gcc_make platform: os: linux @@ -13,7 +13,28 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc perl + - yum -y install make gcc gfortran perl + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS" + +--- +kind: pipeline +name: arm64_clang_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: centos:7 + environment: + CC: clang + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - yum -y install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 9184590c33e9b8df68460877a0d56e229d21d2ce Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:50:37 -0500 Subject: [PATCH 040/127] gfortran->gcc-gfortran --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 3b1515c33..37ca7478f 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc gfortran perl + - yum -y install make gcc gcc-gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -34,7 +34,7 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc gfortran perl clang + - yum -y install make gcc gcc-gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From dc110e179d5110bb807ee9c962e9b7da938ac9a6 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:53:58 -0500 Subject: [PATCH 041/127] Switch to ubuntu and parallel jobs --- .drone.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.drone.yml b/.drone.yml index 37ca7478f..f048cad1f 100644 --- a/.drone.yml +++ b/.drone.yml @@ -8,12 +8,12 
@@ platform: steps: - name: Build and Test - image: centos:7 + image: ubuntu:18.04 environment: CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - yum -y install make gcc gcc-gfortran perl + - apt install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -29,12 +29,12 @@ platform: steps: - name: Build and Test - image: centos:7 + image: ubuntu:18.04 environment: CC: clang - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - yum -y install make gcc gcc-gfortran perl clang + - apt install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 612c2d78e0589634de791c72769c978c2fdc0141 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:55:04 -0500 Subject: [PATCH 042/127] apt update --- .drone.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index f048cad1f..973e00c00 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,8 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt install make gcc gfortran perl clang + - apt-get update + - apt-get install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -34,7 +35,8 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt install make gcc gfortran perl clang + - apt-get update + - apt-get install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 231472c4c6c5e4b76000e62b1ad8b0a0b25c6ed4 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:55:38 -0500 Subject: [PATCH 043/127] Fix typo --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 973e00c00..5fe9983ae 100644 --- a/.drone.yml +++ b/.drone.yml @@ -18,7 +18,7 @@ steps: - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS" + - make -C utest $COMMON_FLAGS --- kind: pipeline @@ -40,4 +40,4 @@ steps: - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS" + - make -C utest $COMMON_FLAGS From 608cd69b66059de14b29639ab29957c99190be5c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:56:59 -0500 Subject: [PATCH 044/127] update yes --- .drone.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index 5fe9983ae..6413bd1c9 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,8 +13,8 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt-get update - - apt-get install make gcc gfortran perl clang + - apt-get update -y + - apt-get install -y make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -35,8 +35,8 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt-get update - - apt-get install make gcc gfortran perl clang + - apt-get update -y + - apt-get install -y make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test 
$COMMON_FLAGS - make -C ctest $COMMON_FLAGS From d40c109eb0ace38d967e221308496854d207a70f Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:02:39 -0500 Subject: [PATCH 045/127] no need of gcc in clang build --- .drone.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index 6413bd1c9..0d7fd2000 100644 --- a/.drone.yml +++ b/.drone.yml @@ -11,10 +11,10 @@ steps: image: ubuntu:18.04 environment: CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - apt-get update -y - - apt-get install -y make gcc gfortran perl clang + - apt-get install -y make $CC gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -33,10 +33,10 @@ steps: image: ubuntu:18.04 environment: CC: clang - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - apt-get update -y - - apt-get install -y make gcc gfortran perl clang + - apt-get install -y make $CC gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From dadafcdcd84ffa8f5545a14aa3b2c0b39398195c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:06:04 -0500 Subject: [PATCH 046/127] Add a cmake build as well --- .drone.yml | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/.drone.yml b/.drone.yml index 0d7fd2000..70bfc5f19 100644 --- a/.drone.yml +++ b/.drone.yml @@ -41,3 +41,47 @@ steps: - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm64_gcc_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + commands: + - apt-get update -y + - apt-get install -y make $CC gfortran perl cmake + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . + - ctest + +--- +kind: pipeline +name: arm64_clang_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + commands: + - apt-get update -y + - apt-get install -y make $CC gfortran perl cmake + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . 
+ - ctest From cd99dfe034e3df8ad850dbae96e2e1deac6fdc2d Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:09:29 -0500 Subject: [PATCH 047/127] Add cmake builds and print options --- .drone.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.drone.yml b/.drone.yml index 70bfc5f19..a7224036e 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,8 +13,10 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl + - $CC --version - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -35,8 +37,10 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl + - $CC --version - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -57,8 +61,10 @@ steps: CC: gcc CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl cmake + - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - cmake --build . @@ -79,8 +85,10 @@ steps: CC: clang CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl cmake + - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - cmake --build . From 3d94ab660f6352d31ef4a92835fd7506869cb80d Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:17:12 -0500 Subject: [PATCH 048/127] build without lapack on cmake --- .drone.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index a7224036e..3df5f406a 100644 --- a/.drone.yml +++ b/.drone.yml @@ -59,11 +59,12 @@ steps: image: ubuntu:18.04 environment: CC: gcc - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + CXX: g++ + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC gfortran perl cmake + - apt-get install -y make $CC $CXX g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. @@ -83,11 +84,12 @@ steps: image: ubuntu:18.04 environment: CC: clang - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + CXX: clang++ + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC gfortran perl cmake + - apt-get install -y make $CC $CXX perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. 
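The .drone.yml jobs above reduce to a handful of commands when run by hand. A
rough local equivalent of the arm64_gcc_cmake pipeline as it stands at this
point in the series, for reproducing it outside CI, would be the following
sketch; the host image and flag values simply mirror the pipeline entries and
can be adjusted:

    # on an Ubuntu 18.04 arm64 host (or, for instance, under qemu-user)
    apt-get update -y
    apt-get install -y make gcc g++ perl cmake
    mkdir build && cd build
    cmake -DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 \
          -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON ..
    cmake --build .
    ctest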
From 7aa6faad5f17cbd6e477e0c393a3ae853e610de8 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:22:36 -0500 Subject: [PATCH 049/127] parallel build --- .drone.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.drone.yml b/.drone.yml index 3df5f406a..a8c69f8ca 100644 --- a/.drone.yml +++ b/.drone.yml @@ -59,16 +59,15 @@ steps: image: ubuntu:18.04 environment: CC: gcc - CXX: g++ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC $CXX g++ perl cmake + - apt-get install -y make $CC g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - - cmake --build . + - make -j - ctest --- @@ -84,14 +83,13 @@ steps: image: ubuntu:18.04 environment: CC: clang - CXX: clang++ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC $CXX perl cmake + - apt-get install -y make $CC g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - - cmake --build . + - make -j - ctest From e3cb8ad2d6cef8a56d8a0543d58c678f7b068ecd Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:28:48 -0500 Subject: [PATCH 050/127] See if ubuntu 19.04 fixes the ICE --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index a8c69f8ca..46f259794 100644 --- a/.drone.yml +++ b/.drone.yml @@ -8,7 +8,7 @@ platform: steps: - name: Build and Test - image: ubuntu:18.04 + image: ubuntu:19.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' From 7ff44e0016f1f1bdeb518e108d9ae65e30004233 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:09:53 -0500 Subject: [PATCH 051/127] Remove qemu armv8 builds --- .travis.yml | 7 ------- azure-pipelines.yml | 20 -------------------- 2 files changed, 27 deletions(-) diff --git a/.travis.yml b/.travis.yml index b2827997c..dc388459b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -193,13 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang - name: "Emulated Build for ARMV8 with clang" - - allow_failures: - - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang # whitelist branches: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4673d07fe..cbea6f4a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,26 +26,6 @@ jobs: cmake --build ." > Dockerfile docker build . displayName: Run ARMV6 docker build -- job: ARMv8_gcc - pool: - vmImage: 'ubuntu-16.04' - steps: - - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm64 - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV8 \ - -D NOFORTRAN=ON \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . 
- displayName: Run ARMV8 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From b911525c81063db8b7525800cff2a7d842b99518 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:14:46 -0500 Subject: [PATCH 052/127] arm32 build --- .drone.yml | 48 +++++++++++++++++++++++++++++++++++++++++++++ .travis.yml | 21 -------------------- azure-pipelines.yml | 20 ------------------- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/.drone.yml b/.drone.yml index 46f259794..aa9e129e0 100644 --- a/.drone.yml +++ b/.drone.yml @@ -22,6 +22,30 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS +--- +kind: pipeline +name: arm32_gcc_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + --- kind: pipeline name: arm64_clang_make @@ -46,6 +70,30 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS +--- +kind: pipeline +name: arm32_clang_cmake + +platform: + os: linux + arch: arm + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - make -j + - ctest + --- kind: pipeline name: arm64_gcc_cmake diff --git a/.travis.yml b/.travis.yml index dc388459b..a92bb0687 100644 --- a/.travis.yml +++ b/.travis.yml @@ -173,27 +173,6 @@ matrix: env: - BTYPE="BINARY=32" - - &emulated-arm - dist: trusty - sudo: required - services: docker - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - name: "Emulated Build for ARMV6 with clang" - before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset - script: | - echo "FROM openblas/alpine:${IMAGE_ARCH} - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=${TARGET_ARCH} \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . - # whitelist branches: only: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cbea6f4a7..7197062d1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,26 +6,6 @@ trigger: - develop jobs: -- job: ARMv6_gcc - pool: - vmImage: 'ubuntu-16.04' - steps: - - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm32 - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV6 \ - -D NOFORTRAN=ON \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . 
- displayName: Run ARMV6 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From b43deb4ad60b2960b4c0ee1aca6afeaadc30673c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:25:45 -0500 Subject: [PATCH 053/127] Fix typo --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index aa9e129e0..779912954 100644 --- a/.drone.yml +++ b/.drone.yml @@ -28,11 +28,11 @@ name: arm32_gcc_make platform: os: linux - arch: arm64 + arch: arm steps: - name: Build and Test - image: ubuntu:18.04 + image: ubuntu:19.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' From a211bc9b6a6e597a38fc8b8b7ed0b006cb367c46 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Tue, 14 May 2019 11:32:23 -0700 Subject: [PATCH 054/127] TST: add SkylakeX AVX512 CI test * adapt the C-level reproducer code for some recent SkylakeX AVX512 kernel issues, provided by Isuru Fernando and modified by Martin Kroeker, for usage in the utest suite * add an Intel SDE SkylakeX emulation utest run to the Azure CI matrix; a custom Docker build was required because Ubuntu image provided by Azure does not support AVX512VL instructions --- azure-pipelines.yml | 24 ++++++++++++++++++ utest/CMakeLists.txt | 1 + utest/Makefile | 1 + utest/test_kernel_regress.c | 50 +++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 utest/test_kernel_regress.c diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7197062d1..9b4c85367 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -25,3 +25,27 @@ jobs: make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile docker build . displayName: Run manylinux1 docker build +- job: Intel_SDE_skx + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + # at the time of writing the available Azure Ubuntu vm image + # does not support AVX512VL, so use more recent LTS version + echo "FROM ubuntu:bionic + COPY . /tmp/openblas + RUN apt-get -y update && apt-get -y install \\ + cmake \\ + gfortran \\ + make \\ + wget + RUN mkdir /tmp/SDE && cd /tmp/SDE && \\ + mkdir sde-external-8.35.0-2019-03-11-lin && \\ + wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\ + tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1 + RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64 + CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile + docker build -t intel_sde . 
+ # we need a privileged docker run for sde process attachment + docker run --privileged intel_sde + displayName: 'Run AVX512 SkylakeX docker build / test' diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index dc306501f..4e647cadc 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,6 +38,7 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c + test_kernel_regress.c ) endif() diff --git a/utest/Makefile b/utest/Makefile index 550a65569..cbe639cdb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -13,6 +13,7 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +OBJS += test_kernel_regress.o endif #this does not work with OpenMP nor with native Windows or Android threads diff --git a/utest/test_kernel_regress.c b/utest/test_kernel_regress.c new file mode 100644 index 000000000..93a30b30c --- /dev/null +++ b/utest/test_kernel_regress.c @@ -0,0 +1,50 @@ +#include "openblas_utest.h" +#include +#include +#include + +#define LAPACK_ROW_MAJOR 101 +blasint LAPACKE_dgesvd( blasint matrix_layout, char jobu, char jobvt, + blasint m, blasint n, double* a, + blasint lda, double* s, double* u, blasint ldu, + double* vt, blasint ldvt, double* superb ); + + +#define DATASIZE 100 + +double s[DATASIZE]; +double u[DATASIZE*DATASIZE]; +double vt[DATASIZE*DATASIZE]; +double X[DATASIZE*DATASIZE]; +double superb[DATASIZE]; +double tmp[DATASIZE*DATASIZE]; +double m[DATASIZE*DATASIZE]; + +CTEST(kernel_regress,skx_avx) +{ + double norm; + int i, j, info; + srand(0); + for (i = 0; i < DATASIZE*DATASIZE; i++) { + m[i] = (rand()+0.0)/RAND_MAX * 10; + tmp[i] = m[i]; + } + + info = LAPACKE_dgesvd( LAPACK_ROW_MAJOR, 'A', 'A', DATASIZE, DATASIZE, m, DATASIZE, + s, u, DATASIZE, vt, DATASIZE, superb); + + for (i = 0; i < DATASIZE; i++) { + for (j = 0; j < DATASIZE; j++) { + u[i*DATASIZE+j] = u[i*DATASIZE+j]*s[j]; + } + } + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + DATASIZE, DATASIZE, DATASIZE, 1, u, DATASIZE, vt, DATASIZE, 0, X, DATASIZE); + + for (i = 0; i < DATASIZE*DATASIZE; i++) { + X[i] = X[i] - tmp[i]; + } + + norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1); + ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10); +} From d2cb610272137536416df2e44f1bc8175ddd4eaf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:18:43 +0200 Subject: [PATCH 055/127] Add option USE_LOCKING for single-threaded build with locking support for calling from concurrent threads --- Makefile.rule | 10 ++++++++-- Makefile.system | 12 ++++++++++++ common.h | 4 ++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 17815096e..faf8c8013 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -56,7 +56,13 @@ VERSION = 0.3.7.dev # specify it. # For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -# USE_THREAD = 0 +USE_THREAD = 0 + +# If you want to build a single-threaded OpenBLAS, but expect to call this +# from several concurrent threads in some other program, comment this in for +# thread safety. (This is done automatically for USE_THREAD=1 , and should not +# be necessary when USE_OPENMP=1) +# USE_LOCKING = 1 # If you're going to use this library with OpenMP, please comment it in. # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. 
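The scenario behind USE_LOCKING is worth illustrating before the remaining
hunks. Below is a minimal sketch — not part of the patch — of an application
that links a USE_THREAD=0 build of the library but issues BLAS calls from two
of its own threads, which is exactly the pattern the new locking is meant to
make safe; the matrix size and iteration count are arbitrary:

    #include <cblas.h>
    #include <pthread.h>
    #include <stdlib.h>

    enum { DIM = 64 };
    static double a[DIM * DIM], b[DIM * DIM];  /* shared, read-only inputs */

    /* Each thread owns its output buffer: the race guarded against lives in
       OpenBLAS itself (the shared buffer pool in memory.c), not in user data. */
    static void *worker(void *arg) {
        double *c = malloc(sizeof(double) * DIM * DIM);
        for (int i = 0; i < 100; i++)
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        DIM, DIM, DIM, 1.0, a, DIM, b, DIM, 0.0, c, DIM);
        free(c);
        return arg;
    }

    int main(void) {
        pthread_t t1, t2;
        pthread_create(&t1, NULL, worker, NULL);
        pthread_create(&t2, NULL, worker, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }

With USE_THREAD=0 and no USE_LOCKING, the alloc_lock/init_lock paths guarded
by the preprocessor changes below compile away, so two such workers can race
on the library's buffer bookkeeping.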
@@ -220,7 +226,7 @@ NO_AFFINITY = 1 COMMON_PROF = -pg # Build Debug version -# DEBUG = 1 +DEBUG = 1 # Set maximum stack allocation. # The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV diff --git a/Makefile.system b/Makefile.system index a95d6190f..29aef7e27 100644 --- a/Makefile.system +++ b/Makefile.system @@ -237,6 +237,10 @@ SMP = 1 endif endif +ifeq ($(SMP), 1) +USE_LOCKING = +endif + ifndef NEED_PIC NEED_PIC = 1 endif @@ -388,6 +392,12 @@ ifneq ($(MAX_STACK_ALLOC), 0) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) endif +ifdef USE_LOCKING +ifneq ($(USE_LOCKING), 0) +CCOMMON_OPT += -DUSE_LOCKING +endif +endif + # # Architecture dependent settings # @@ -744,6 +754,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive +# work around ABI changes in gfortran 9 that break calls from C code +FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran diff --git a/common.h b/common.h index 0ac74bb20..a9fe8d911 100644 --- a/common.h +++ b/common.h @@ -131,7 +131,7 @@ extern "C" { #include #include #include -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #include #endif #endif @@ -200,7 +200,7 @@ extern "C" { #error "You can't specify both LOCK operation!" #endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #define USE_PTHREAD_LOCK #undef USE_PTHREAD_SPINLOCK #endif From 1e52572be38541cc11ac39cef6cded8a640bb65b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:19:30 +0200 Subject: [PATCH 056/127] Add option USE_LOCKING for single-threaded build with locking support --- cmake/system.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index d0f560872..adedd32cc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -136,6 +136,10 @@ endif () if (USE_THREAD) message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") +else() + if (${USE_LOCKING}) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING") + endif () endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") From 86dda5c2fa9e298deacdd17211e2c4e58f2688ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:21:20 +0200 Subject: [PATCH 057/127] Add option USE_LOCKING for SMP-like locking in USE_THREAD=0 builds --- driver/others/memory.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 02352b3ae..adb1ec86c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2062,13 +2062,13 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif } @@ -2214,13 +2214,13 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos 
++; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif } @@ -2701,7 +2701,7 @@ void *blas_memory_alloc(int procpos){ position = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif do { @@ -2718,7 +2718,7 @@ void *blas_memory_alloc(int procpos){ position ++; } while (position < NUM_BUFFERS); -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif goto error; @@ -2730,7 +2730,7 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #else blas_unlock(&memory[position].lock); @@ -2779,11 +2779,11 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2839,7 +2839,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) @@ -2855,7 +2855,7 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2872,7 +2872,7 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif return; @@ -2924,7 +2924,7 @@ void blas_shutdown(void){ #if defined(OS_LINUX) && !defined(NO_WARMUP) -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) @@ -2949,7 +2949,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, if (hot_alloc != 2) { #endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) LOCK_COMMAND(&init_lock); #endif @@ -2959,7 +2959,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size -= PAGESIZE; } -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) UNLOCK_COMMAND(&init_lock); #endif From 5ecffc28f2c32a23222ab633c904c9886923ecf1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:36:17 +0200 Subject: [PATCH 058/127] Add option USE_LOCKING but keep default settings intact --- Makefile.rule | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index faf8c8013..255d1da46 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -56,7 +56,7 @@ VERSION = 0.3.7.dev # specify it. 
# For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -USE_THREAD = 0 +# USE_THREAD = 0 # If you want to build a single-threaded OpenBLAS, but expect to call this # from several concurrent threads in some other program, comment this in for @@ -226,7 +226,7 @@ NO_AFFINITY = 1 COMMON_PROF = -pg # Build Debug version -DEBUG = 1 +# DEBUG = 1 # Set maximum stack allocation. # The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV From f66c11fc22fa01eb8e120d4274d262b3795e4281 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:38:12 +0200 Subject: [PATCH 059/127] Remove unrelated change --- Makefile.system | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 29aef7e27..f574edf88 100644 --- a/Makefile.system +++ b/Makefile.system @@ -754,8 +754,6 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive -# work around ABI changes in gfortran 9 that break calls from C code -FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran From 1778fd4219688e84463844f3aeaf824ca4043b31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 May 2019 13:48:27 +0200 Subject: [PATCH 060/127] Do not try ancient PGI hacks with recent versions of that compiler should fix #2139 --- driver/others/memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 02352b3ae..bf2cfb996 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif @@ -3192,7 +3194,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); - +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -3200,6 +3202,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif From 940f38f6dd504c02a554470b53545270e8e5a351 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 May 2019 13:02:23 +0200 Subject: [PATCH 061/127] Build and run utests in any case, they do their own checks for fortran availability --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 273fde33e..aed248ef2 100644 --- a/Makefile +++ b/Makefile @@ -123,8 +123,8 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all - $(MAKE) -C utest all endif + $(MAKE) -C utest all ifndef NO_CBLAS $(MAKE) -C ctest all endif From 79366ff7a9548e7eb5d200c7ac444d35b28f2b7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 May 2019 20:34:22 +0200 Subject: [PATCH 
062/127] Add softfp support in min/max kernels fix for #1912 --- kernel/arm/iamax_vfp.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index fd43b15b1..ae362935e 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -469,9 +469,11 @@ iamax_kernel_S10: iamax_kernel_L999: - +#if !defined(__ARM_PCS_VFP) + vmov r0, s0 +#else mov r0, INDEX // set return value - +#endif pop {r4} bx lr From d76b20b4d2617582c8e1ac8a5aeb079e5c9de6f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 29 May 2019 14:07:17 +0200 Subject: [PATCH 063/127] Revert "Add softfp support in min/max kernels" --- kernel/arm/iamax_vfp.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index ae362935e..fd43b15b1 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -469,11 +469,9 @@ iamax_kernel_S10: iamax_kernel_L999: -#if !defined(__ARM_PCS_VFP) - vmov r0, s0 -#else + mov r0, INDEX // set return value -#endif + pop {r4} bx lr From c70496b1082983e4d68a2513486a9d2fcbef44e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 29 May 2019 15:02:51 +0200 Subject: [PATCH 064/127] Separate implementations of AMAX and IAMAX on arm As noted in #1912 and comment on #1942, the combined implementation happens to "do the right thing" on hardfp, but cannot return both value and index on softfp where they would have to share the return register --- kernel/arm/KERNEL.ARMV6 | 24 +-- kernel/arm/amax_vfp.S | 441 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 453 insertions(+), 12 deletions(-) create mode 100644 kernel/arm/amax_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b773a5ba0..1c561deb6 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,20 +1,20 @@ include $(KERNELDIR)/KERNEL.ARMV5 -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S +SAMAXKERNEL = amax_vfp.S +DAMAXKERNEL = amax_vfp.S +CAMAXKERNEL = amax_vfp.S +ZAMAXKERNEL = amax_vfp.S -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S +SAMINKERNEL = amax_vfp.S +DAMINKERNEL = amax_vfp.S +CAMINKERNEL = amax_vfp.S +ZAMINKERNEL = amax_vfp.S -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S +SMAXKERNEL = amax_vfp.S +DMAXKERNEL = amax_vfp.S -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S +SMINKERNEL = amax_vfp.S +DMINKERNEL = amax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S new file mode 100644 index 000000000..c780ce5bd --- /dev/null +++ b/kernel/arm/amax_vfp.S @@ -0,0 +1,441 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(USE_ABS) + +#if defined(DOUBLE) + +#define VABS(x0,x1) vabs.f64 x0, x1 + +#else + +#define VABS(x0,x1) vabs.f32 x0, x1 + +#endif + +#else + +#define VABS(x0,x1) nop + +#endif + +/*****************************************************************************************/ + +#if defined(USE_MIN) + +#define MOVCOND movlt + +#if defined(DOUBLE) + +#define VMOVCOND vmovlt.f64 + +#else + +#define VMOVCOND vmovlt.f32 + +#endif + +#else + +#define MOVCOND movgt + +#if defined(DOUBLE) + +#define VMOVCOND vmovgt.f64 + +#else + +#define VMOVCOND vmovgt.f32 + +#endif + + +#endif + + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro INIT_F + + vldmia.f64 X!, { d0 } + VABS( d0, d0 ) + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + +.endm + +.macro INIT_S + + vldmia.f64 X, { d0 } + VABS( d0, d0 ) + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 } + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + vldmia.f32 X!, { s0 } + VABS( s0, s0 ) + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + +.endm + +.macro INIT_S + + vldmia.f32 X, { s0 } + VABS( s0, s0 ) + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 } + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro INIT_F + + vldmia.f64 X!, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 +.endm + + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 - d5 } + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 
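+	// vcmpe updates only the FPSCR flags; vmrs below copies them to APSR so the conditional VMOVCOND can test them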
+ vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + +.endm + +.macro INIT_S + + vldmia.f64 X, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 - d5 } + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + vldmia.f32 X!, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + +.endm + + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 - s5 } + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + +.endm + +.macro INIT_S + + vldmia.f32 X, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 - s5 } + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + add X, X, INC_X + +.endm + + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + movs r12, #0 // clear floating point register + vmov s0, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 +#endif + + + cmp N, #0 + ble amax_kernel_L999 + + cmp INC_X, #0 + beq amax_kernel_L999 + + + cmp INC_X, #1 + bne amax_kernel_S_BEGIN + + +amax_kernel_F_BEGIN: + + INIT_F + + subs N, N , #1 + ble amax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble amax_kernel_F1 + + .align 5 + +amax_kernel_F4: + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + ble amax_kernel_F1 + + +#if defined(COMPLEX) || defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F4 + +amax_kernel_F1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F10 + + b amax_kernel_L999 + +amax_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + INIT_S + + subs N, N , #1 + ble amax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble amax_kernel_S1 + + .align 5 + +amax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S4 + +amax_kernel_S1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S10 + + +amax_kernel_L999: +#if !defined(__ARM_PCS_VFP) + vmov r0, s0 +#endif + bx lr + + EPILOGUE + From c5495d20563d9a7a142c6726d24c0fd485fcedf6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 May 2019 11:25:43 +0200 Subject: [PATCH 065/127] Ensure correct output for DAMAX with softfp --- kernel/arm/amax_vfp.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S index c780ce5bd..d3770ea1e 100644 --- a/kernel/arm/amax_vfp.S +++ b/kernel/arm/amax_vfp.S @@ -432,8 +432,12 @@ amax_kernel_S10: amax_kernel_L999: -#if !defined(__ARM_PCS_VFP) +#if 
!defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else vmov r0, s0 +#endif #endif bx lr From 74c10b57c6ea9d80f77c469b50f90989843b0bb9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 May 2019 11:38:11 +0200 Subject: [PATCH 066/127] Use generic kernels for complex (I)AMAX to support softfp --- kernel/arm/KERNEL.ARMV6 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1c561deb6..344a71885 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -2,13 +2,13 @@ include $(KERNELDIR)/KERNEL.ARMV5 SAMAXKERNEL = amax_vfp.S DAMAXKERNEL = amax_vfp.S -CAMAXKERNEL = amax_vfp.S -ZAMAXKERNEL = amax_vfp.S +#CAMAXKERNEL = amax_vfp.S +#ZAMAXKERNEL = amax_vfp.S SAMINKERNEL = amax_vfp.S DAMINKERNEL = amax_vfp.S -CAMINKERNEL = amax_vfp.S -ZAMINKERNEL = amax_vfp.S +#CAMINKERNEL = amax_vfp.S +#ZAMINKERNEL = amax_vfp.S SMAXKERNEL = amax_vfp.S DMAXKERNEL = amax_vfp.S @@ -18,13 +18,13 @@ DMINKERNEL = amax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S +#ICAMAXKERNEL = iamax_vfp.S +#IZAMAXKERNEL = iamax_vfp.S ISAMINKERNEL = iamax_vfp.S IDAMINKERNEL = iamax_vfp.S -ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S +#ICAMINKERNEL = iamax_vfp.S +#IZAMINKERNEL = iamax_vfp.S ISMAXKERNEL = iamax_vfp.S IDMAXKERNEL = iamax_vfp.S From 8fe794f059a29922f1a4de7ecd143f35c79eb7e9 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Thu, 23 May 2019 04:23:43 +0000 Subject: [PATCH 067/127] improved zgemm power9 based on power8 --- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/sgemm_kernel_power9.S | 2 +- kernel/power/sgemm_logic_power9.S | 40 +- kernel/power/zgemm_kernel_power9.S | 257 +++++ kernel/power/zgemm_logic_power9.S | 857 ++++++++++++++ kernel/power/zgemm_macros_power9.S | 1664 ++++++++++++++++++++++++++++ param.h | 4 +- 7 files changed, 2802 insertions(+), 24 deletions(-) create mode 100644 kernel/power/zgemm_kernel_power9.S create mode 100644 kernel/power/zgemm_logic_power9.S create mode 100644 kernel/power/zgemm_macros_power9.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 0e0d62393..5c10ad64a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy.o CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o -ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index a44659468..f408cdc17 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*alpha is stored in f1. 
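   Since f1 overlays vs1 in the unified FP/VSX register file, the scalar can be
   read directly as a vector-scalar operand here;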
convert to single and splat*/ - xscvdpspn alpha_r,vs1 + xscvdpspn alpha_r,vs1 xxspltw alpha_r,alpha_r,0 diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 300e30470..c149cb903 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -53,9 +53,9 @@ LSGEMM_L8x16_BEGIN: LSGEMM_L8x16_LOOP_START: LOAD8x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=32 - addi AO,AO,2112 - addi BO,BO,32 + /*##OffsetA=64 OffsetB=32 + #addi AO,AO,2112 + #addi BO,BO,32 */ mtctr L @@ -63,29 +63,29 @@ LSGEMM_L8x16_LOOP_START: LSGEMM_L8x16_LOOP: - KERNEL8x16_I1_L4_2 -2048,0, 0,0 - KERNEL8x16_I1_L4_2 -2048,0, 1,0 - KERNEL8x16_I1_L4_2 -2048,0, 2,0 - KERNEL8x16_I1_L4_2 -2048,0, 3,0 - KERNEL8x16_I1_L4_2 -2048,0, 4,0 - KERNEL8x16_I1_L4_2 -2048,0, 5,0 - KERNEL8x16_I1_L4_2 -2048,0, 6,0 - KERNEL8x16_I1_L4_2 -2048,0, 7,0 - KERNEL8x16_I1_L4_2 -2048,0, 8,0 - KERNEL8x16_I1_L4_2 -2048,0, 9,0 - KERNEL8x16_I1_L4_2 -2048,0, 10,0 - KERNEL8x16_I1_L4_2 -2048,0, 11,0 - KERNEL8x16_I1_L4_2 -2048,0, 12,0 - KERNEL8x16_I1_L4_2 -2048,0, 13,0 - KERNEL8x16_I1_L4_2 -2048,0, 14,0 - KERNEL8x16_I1_L4_2 -2048,0, 15,1 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_2 64,32, 15,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: - END8x16 0, AO, BO, -2048, 0 + END8x16 0, AO, BO, 64, 32 b LSGEMM_L8x16_SUB1 MY_ALIGN diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S new file mode 100644 index 000000000..e655f0bfe --- /dev/null +++ b/kernel/power/zgemm_kernel_power9.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 32192 + +#define FZERO 312+192(SP) + + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 + +#define L r15 +#define ALPHA r16 +#define T5 r17 +#define T2 r19 +#define BBO r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + stw r0, FZERO + +#ifdef linux + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li o8 , 8 + li o16 , 16 + + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + + + addi ALPHA, SP, 296+192 + + xxlor alpha_r,vs1,vs1 /*copy from register f1 */ + xxlor alpha_i,vs2,vs2 /*copy from register f2 */ + + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + 
ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S new file mode 100644 index 000000000..77ce36294 --- /dev/null +++ b/kernel/power/zgemm_logic_power9.S @@ -0,0 +1,857 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define MY_ALIGN .align 3 + + srawi. J, N, 1 + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + srawi. T1, K, 2 + ble ZGEMM_L2_COPYB1 + +ZGEMM_L2_COPYB8: + + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB8 + +ZGEMM_L2_COPYB1: + + andi. T1, K, 3 + ble ZGEMM_L2_COPYB_END + +ZGEMM_L2_COPYB_LOOP: + + ZCOPYB_2 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB_LOOP + +ZGEMM_L2_COPYB_END: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble ZGEMM_L2x8_END + +ZGEMM_L2x8_BEGIN: + + + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 32x */ + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + + +ZGEMM_L2x8_LOOP_START: + + LOAD2x8 0 + li T2, 1024 + li T3, 1024+512 + li T4, 2048 + li T5, 2048+512 + mtctr L + + MY_ALIGN +ZGEMM_L2x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,64,0,0 + KERNEL2x8_L 128,64,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,64,2,0 + KERNEL2x8_L 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,64,4,0 + KERNEL2x8_L 128,64,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,64,6,0 + KERNEL2x8_L 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,64,8,0 + KERNEL2x8_L 128,64,9,0 + KERNEL2x8_L 128,64,10,0 + KERNEL2x8_L 128,64,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,64,12,0 + KERNEL2x8_L 128,64,13,0 + KERNEL2x8_L 128,64,14,0 + KERNEL2x8_L 128,64,15,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: + END2x8 AO, BO, 128, 64 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB0: + + andi. L, K, 63 + + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x8_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x8_SUB2_LOOP: + LOAD2x8 0 + KERNEL2x8_L 128,64, 0,0 + KERNEL2x8_L 128,64, 1,0 + KERNEL2x8_L 128,64, 2,0 + KERNEL2x8_E 128,64, 3,1 + bdnz ZGEMM_L2x8_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x8_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8 0 + KERNEL2x8_L 128,64, 0,0 + KERNEL2x8_E 128,64, 1,1 + MY_ALIGN +ZGEMM_L2x8_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8 0 + KERNEL2x8_E 128,64, 0,1 + MY_ALIGN +ZGEMM_L2x8_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + +/* addic. L, L, -1 + bgt ZGEMM_L2x8_SUB2_1*/ + +ZGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_END: + +ZGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 4 /**(K-1) % 16x */ + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + +ZGEMM_L2x4_LOOP_START: + LOAD2x4 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x4_LOOP: + KERNEL2x4_L 64,64,0,0 + KERNEL2x4_L 64,64,1,0 + KERNEL2x4_L 64,64,2,0 + KERNEL2x4_L 64,64,3,0 + KERNEL2x4_L 64,64,4,0 + KERNEL2x4_L 64,64,5,0 + KERNEL2x4_L 64,64,6,0 + KERNEL2x4_L 64,64,7,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: + END2x4 AO, BO, 64, 64 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x4_SAVE + +ZGEMM_L2x4_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x4_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x4_SUB2_LOOP: + LOAD2x4 0 + KERNEL2x4_L 64,64, 0,0 + KERNEL2x4_L 64,64, 1,0 + KERNEL2x4_L 64,64, 2,0 + KERNEL2x4_E 64,64, 3,1 + bdnz ZGEMM_L2x4_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x4_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4 0 + KERNEL2x4_L 64,64, 0,0 + KERNEL2x4_E 64,64, 1,1 + MY_ALIGN +ZGEMM_L2x4_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4 0 + KERNEL2x4_E 64,64, 0,1 + MY_ALIGN +ZGEMM_L2x4_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + +ZGEMM_L2x4_SAVE: + + SAVE2x4 + +ZGEMM_L2x4_END: + +ZGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L2x2_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 4 /**(K-1) % 16x */ + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + +ZGEMM_L2x2_LOOP_START: + LOAD2x2 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x2_LOOP: + KERNEL2x2_L 32,64,0,0 + KERNEL2x2_L 32,64,1,0 + KERNEL2x2_L 32,64,2,0 + KERNEL2x2_L 32,64,3,0 + KERNEL2x2_L 32,64,4,0 + KERNEL2x2_L 32,64,5,0 + KERNEL2x2_L 32,64,6,0 + KERNEL2x2_L 32,64,7,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN +ZGEMM_L2x2_LOOP_END: + END2x2 AO, BO, 32, 64 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x2_SAVE + +ZGEMM_L2x2_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x2_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x2_SUB2_LOOP: + LOAD2x2 0 + KERNEL2x2_L 32,64, 0,0 + KERNEL2x2_L 32,64, 1,0 + KERNEL2x2_L 32,64, 2,0 + KERNEL2x2_E 32,64, 3,1 + bdnz ZGEMM_L2x2_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x2_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2 0 + KERNEL2x2_L 32,64, 0,0 + KERNEL2x2_E 32,64, 1,1 + MY_ALIGN +ZGEMM_L2x2_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2 0 + KERNEL2x2_E 32,64, 0,1 + MY_ALIGN +ZGEMM_L2x2_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 +ZGEMM_L2x2_SAVE: + + SAVE2x2 + +ZGEMM_L2x2_END: + +ZGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L2x1_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 4 /**(K-1) % 16x */ + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + +ZGEMM_L2x1_LOOP_START: + + LOAD2x1 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x1_LOOP: + KERNEL2x1_L 16,64,0,0 + KERNEL2x1_L 16,64,1,0 + KERNEL2x1_L 16,64,2,0 + KERNEL2x1_L 16,64,3,0 + KERNEL2x1_L 16,64,4,0 + KERNEL2x1_L 16,64,5,0 + KERNEL2x1_L 16,64,6,0 + KERNEL2x1_L 16,64,7,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: + END2x1 AO, BO, 16, 64 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x1_SAVE + +ZGEMM_L2x1_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x1_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x1_SUB2_LOOP: + LOAD2x1 0 + KERNEL2x1_L 16,64, 0,0 + KERNEL2x1_L 16,64, 1,0 + KERNEL2x1_L 16,64, 2,0 + KERNEL2x1_E 16,64, 3,1 + bdnz ZGEMM_L2x1_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x1_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1 0 + KERNEL2x1_L 16,64, 0,0 + KERNEL2x1_E 16,64, 1,1 + MY_ALIGN +ZGEMM_L2x1_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1 0 + KERNEL2x1_E 16,64, 0,1 + MY_ALIGN +ZGEMM_L2x1_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + +ZGEMM_L2x1_SAVE: + + SAVE2x1 + +ZGEMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt ZGEMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZGEMM_L2_END: + + b ZGEMM_L1_BEGIN + +L999_H1: + + b L999 + +ZGEMM_L1_BEGIN: + andi. T1, N, 1 + ble ZGEMM_L1_END + + mr BO, B + mr BBO, BBUFFER + srawi. T1, K, 3 /*this time K/8 */ + ble ZGEMM_L1_COPYB1 + +ZGEMM_L1_COPYB8: + + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8 + addic. T1, T1, -1 + + bgt ZGEMM_L1_COPYB8 + +ZGEMM_L1_COPYB1: + + andi. T1, K, 7 + ble ZGEMM_L1_COPYB_END + +ZGEMM_L1_COPYB_LOOP: + + ZCOPYB_1 + addic. T1, T1, -1 + + bgt ZGEMM_L1_COPYB_LOOP + +ZGEMM_L1_COPYB_END: + + mr CO, C + mr AO, A + srawi. I, M, 3 + ble ZGEMM_L1x8_END + +ZGEMM_L1x8_BEGIN: + + + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 32x */ + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + + +ZGEMM_L1x8_LOOP_START: + + LOAD1x8 0 + li T2, 1024 + li T3, 1024+512 + li T4, 2048 + li T5, 2048+512 + mtctr L + + MY_ALIGN +ZGEMM_L1x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L 128,32,0,0 + KERNEL1x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL1x8_L 128,32,2,0 + KERNEL1x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L 128,32,4,0 + KERNEL1x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL1x8_L 128,32,6,0 + KERNEL1x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L 128,32,8,0 + KERNEL1x8_L 128,32,9,0 + KERNEL1x8_L 128,32,10,0 + KERNEL1x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL1x8_L 128,32,12,0 + KERNEL1x8_L 128,32,13,0 + KERNEL1x8_L 128,32,14,0 + KERNEL1x8_L 128,32,15,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: + END1x8 AO, BO, 128, 32 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x8_SAVE + +ZGEMM_L1x8_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x8_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x8_SUB2_LOOP: + LOAD1x8 0 + KERNEL1x8_L 128,32, 0,0 + KERNEL1x8_L 128,32, 1,0 + KERNEL1x8_L 128,32, 2,0 + KERNEL1x8_E 128,32, 3,1 + bdnz ZGEMM_L1x8_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x8_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8 0 + KERNEL1x8_L 128,32, 0,0 + KERNEL1x8_E 128,32, 1,1 + MY_ALIGN +ZGEMM_L1x8_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8 0 + KERNEL1x8_E 128,32, 0,1 + MY_ALIGN +ZGEMM_L1x8_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + +/* addic. L, L, -1 + bgt ZGEMM_L1x8_SUB2_1*/ + +ZGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt ZGEMM_L1x8_BEGIN + +ZGEMM_L1x8_END: + +ZGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L1x1_END + + andi. T1, M, 4 + ble ZGEMM_L1x4_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 5 /**(K-1) % 16x */ + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + +ZGEMM_L1x4_LOOP_START: + LOAD1x4 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x4_LOOP: + KERNEL1x4_L 64,32,0,0 + KERNEL1x4_L 64,32,1,0 + KERNEL1x4_L 64,32,2,0 + KERNEL1x4_L 64,32,3,0 + KERNEL1x4_L 64,32,4,0 + KERNEL1x4_L 64,32,5,0 + KERNEL1x4_L 64,32,6,0 + KERNEL1x4_L 64,32,7,0 + KERNEL1x4_L 64,32,8,0 + KERNEL1x4_L 64,32,9,0 + KERNEL1x4_L 64,32,10,0 + KERNEL1x4_L 64,32,11,0 + KERNEL1x4_L 64,32,12,0 + KERNEL1x4_L 64,32,13,0 + KERNEL1x4_L 64,32,14,0 + KERNEL1x4_L 64,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN +ZGEMM_L1x4_LOOP_END: + END1x4 AO, BO, 64, 32 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + +ZGEMM_L1x4_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x4_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x4_SUB2_LOOP: + LOAD1x4 0 + KERNEL1x4_L 64,32, 0,0 + KERNEL1x4_L 64,32, 1,0 + KERNEL1x4_L 64,32, 2,0 + KERNEL1x4_E 64,32, 3,1 + bdnz ZGEMM_L1x4_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x4_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4 0 + KERNEL1x4_L 64,32, 0,0 + KERNEL1x4_E 64,32, 1,1 + MY_ALIGN +ZGEMM_L1x4_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4 0 + KERNEL1x4_E 64,32, 0,1 + MY_ALIGN +ZGEMM_L1x4_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + +ZGEMM_L1x4_SAVE: + + SAVE1x4 + +ZGEMM_L1x4_END: + +ZGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L1x2_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 16x */ + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + +ZGEMM_L1x2_LOOP_START: + LOAD1x2 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x2_LOOP: + KERNEL1x2_L 32,32,0,0 + KERNEL1x2_L 32,32,1,0 + KERNEL1x2_L 32,32,2,0 + KERNEL1x2_L 32,32,3,0 + KERNEL1x2_L 32,32,4,0 + KERNEL1x2_L 32,32,5,0 + KERNEL1x2_L 32,32,6,0 + KERNEL1x2_L 32,32,7,0 + KERNEL1x2_L 32,32,8,0 + KERNEL1x2_L 32,32,9,0 + KERNEL1x2_L 32,32,10,0 + KERNEL1x2_L 32,32,11,0 + KERNEL1x2_L 32,32,12,0 + KERNEL1x2_L 32,32,13,0 + KERNEL1x2_L 32,32,14,0 + KERNEL1x2_L 32,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN +ZGEMM_L1x2_LOOP_END: + END1x2 AO, BO, 32, 32 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + +ZGEMM_L1x2_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x2_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x2_SUB2_LOOP: + LOAD1x2 0 + KERNEL1x2_L 32,32, 0,0 + KERNEL1x2_L 32,32, 1,0 + KERNEL1x2_L 32,32, 2,0 + KERNEL1x2_E 32,32, 3,1 + bdnz ZGEMM_L1x2_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x2_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2 0 + KERNEL1x2_L 32,32, 0,0 + KERNEL1x2_E 32,32, 1,1 + MY_ALIGN +ZGEMM_L1x2_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2 0 + KERNEL1x2_E 32,32, 0,1 + MY_ALIGN +ZGEMM_L1x2_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 +ZGEMM_L1x2_SAVE: + + SAVE1x2 + +ZGEMM_L1x2_END: + +ZGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L1x1_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 5 /**(K-1) % 16x */ + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + +ZGEMM_L1x1_LOOP_START: + + LOAD1x1 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x1_LOOP: + KERNEL1x1_L 16,32,0,0 + KERNEL1x1_L 16,32,1,0 + KERNEL1x1_L 16,32,2,0 + KERNEL1x1_L 16,32,3,0 + KERNEL1x1_L 16,32,4,0 + KERNEL1x1_L 16,32,5,0 + KERNEL1x1_L 16,32,6,0 + KERNEL1x1_L 16,32,7,0 + KERNEL1x1_L 16,32,8,0 + KERNEL1x1_L 16,32,9,0 + KERNEL1x1_L 16,32,10,0 + KERNEL1x1_L 16,32,11,0 + KERNEL1x1_L 16,32,12,0 + KERNEL1x1_L 16,32,13,0 + KERNEL1x1_L 16,32,14,0 + KERNEL1x1_L 16,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN +ZGEMM_L1x1_LOOP_END: + END1x1 AO, BO, 16, 32 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + +ZGEMM_L1x1_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x1_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x1_SUB2_LOOP: + LOAD1x1 0 + KERNEL1x1_L 16,32, 0,0 + KERNEL1x1_L 16,32, 1,0 + KERNEL1x1_L 16,32, 2,0 + KERNEL1x1_E 16,32, 3,1 + bdnz ZGEMM_L1x1_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x1_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1 0 + KERNEL1x1_L 16,32, 0,0 + KERNEL1x1_E 16,32, 1,1 + MY_ALIGN +ZGEMM_L1x1_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1 0 + KERNEL1x1_E 16,32, 0,1 + MY_ALIGN +ZGEMM_L1x1_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + +ZGEMM_L1x1_SAVE: + + SAVE1x1 + +ZGEMM_L1x1_END: + +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S new file mode 100644 index 000000000..93a309ad1 --- /dev/null +++ b/kernel/power/zgemm_macros_power9.S @@ -0,0 +1,1664 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V + AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7 +.endm + +.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8 + xxlxor \TEMP1, \TEMP1, \TEMP1 + xxlxor \TEMP2, \TEMP2, \TEMP2 + + xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB + XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB + + xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB + xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB + + XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB + XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB + + xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i + xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r + xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r + xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i + + xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i + xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r + xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) 
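+/* Note: each DISPn(ind,disp) helper expands to ind*unit_size*n + disp, i.e. the
+   byte offset of unrolled iteration `ind` over n complex doubles (16 bytes
+   each) plus a fixed displacement; e.g. DISP16(1,32) = 1*16*16 + 32 = 288. */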
+#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD2x8 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A + +.if \Zero==1 + Zero2x8 +.endif + +.endm + +.macro END2x8_NORMAL + END2x8 AO,BO,128,64 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, 
\OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B 
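+       /* B is read from BBUFFER, which the ZCOPYB_* macros filled with the
+          real and imaginary part of each B element splatted into separate
+          quads, so one k step of the two columns consumes 64 bytes of B. */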
+ lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x8 + LOAD2x8 0 + END2x8 AO, BO, 128,64 +.endm + +.macro SAVE2x8 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs48,vs49,vs8 + AGGREGATE_INTO_COMPLEX vs50,vs51,vs9 + AGGREGATE_INTO_COMPLEX vs52,vs53,vs10 + AGGREGATE_INTO_COMPLEX vs54,vs55,vs11 + AGGREGATE_INTO_COMPLEX vs56,vs57,vs12 + 
AGGREGATE_INTO_COMPLEX vs58,vs59,vs13 + AGGREGATE_INTO_COMPLEX vs60,vs61,vs14 + AGGREGATE_INTO_COMPLEX vs62,vs63,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + addi CO, CO, 128 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD2x4 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + +.if \Zero==1 + Zero2x4 +.endif + +.endm + +.macro END2x4_NORMAL + END2x4 AO,BO,64,64 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag 
part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x4 + LOAD2x4 0 + END2x4 AO, BO, 64,64 +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs40,vs41,vs8 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs9 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs10 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + 
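+       /* T1 was advanced by LDC above, so these stores write the second row
+          of C with the alpha-scaled vs40..vs47 results */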
stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + addi CO, CO, 64 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + +.macro LOAD2x2 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + +.if \Zero==1 + Zero2x2 +.endif + +.endm + +.macro END2x2_NORMAL + END2x2 AO,BO,32,64 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, 
vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x2 + LOAD2x2 0 + END2x2 AO, BO, 32,64 +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs36,vs37,vs8 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + + addi CO, CO, 32 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 +.endm + +.macro LOAD2x1 Zero + lxv vs0, 0(AO) // load real,imag from A + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + +.if \Zero==1 + Zero2x1 +.endif + +.endm + +.macro END2x1_NORMAL + END2x1 AO,BO,16,64 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, 
imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x1 + LOAD2x1 0 + END2x1 AO, BO, 16,64 +.endm + +.macro SAVE2x1 + + mr T1, CO +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + + AGGREGATE_INTO_COMPLEX vs34,vs35,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + + addi CO, CO, 16 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD1x8 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A + +.if \Zero==1 + Zero1x8 +.endif + +.endm + +.macro END1x8_NORMAL + END1x8 AO,BO,128,32 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, 
DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + +.endm + +.macro KERNEL1x8 + LOAD1x8 0 + END1x8 AO, BO, 128,32 +.endm + +.macro SAVE1x8 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + AGGREGATE_INTO_COMPLEX 
vs40,vs41,vs12 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + addi CO, CO, 128 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + +.macro LOAD1x4 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + +.if \Zero==1 + Zero1x4 +.endif + +.endm + +.macro END1x4_NORMAL + END1x4 AO,BO,64,32 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, 
imag*imag + +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL1x4 + LOAD1x4 0 + END1x4 AO, BO, 64,32 +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + addi CO, CO, 64 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 +.endm + +.macro LOAD1x2 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + +.if \Zero==1 + Zero1x2 +.endif + +.endm + +.macro END1x2_NORMAL + END1x2 AO,BO,32,32 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // 
load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + +.endm + +.macro KERNEL1x2 + LOAD1x2 0 + END1x2 AO, BO, 32,32 +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + +addi CO, CO, 32 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + +.macro LOAD1x1 Zero + lxv vs0, 0(AO) // load real,imag from A + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + +.if \Zero==1 + Zero1x1 +.endif + +.endm + +.macro END1x1_NORMAL + END1x1 AO,BO,16,32 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, 
vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1 + LOAD1x1 0 + END1x1 AO, BO, 16,32 + +.endm + +.macro SAVE1x1 + + mr T1, CO +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + +addi CO, CO, 16 + +.endm + + +.macro ZCOPYB_2 + + lxv vs32, 0(BO) + lxv vs33, 16(BO) + addi BO, BO, 32 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + xxspltd vs42, vs33, 1 + xxspltd vs43, vs33, 0 + + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + stxv vs42, 32(BBO) + stxv vs43, 48(BBO) + addi BBO, BBO, 64 + +.endm + +.macro ZCOPYB_1 + + lxv vs32, 0(BO) + addi BO, BO, 16 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + + addi BBO, BBO, 32 + +.endm + +.macro ZCOPYB_8 + + lxv vs32, 0(BO) + lxv vs33, 16(BO) + lxv vs34, 32(BO) + lxv vs35, 48(BO) + + lxv vs36, 64+0(BO) + lxv vs37, 64+16(BO) + lxv vs38, 64+32(BO) + lxv vs39, 64+48(BO) + addi BO, BO, 128 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + xxspltd vs42, vs33, 1 + xxspltd vs43, vs33, 0 + xxspltd vs44, vs34, 1 + xxspltd vs45, vs34, 0 + xxspltd vs46, vs35, 1 + xxspltd vs47, vs35, 0 + + xxspltd vs48, vs36, 1 + xxspltd vs49, vs36, 0 + xxspltd vs50, vs37, 1 + xxspltd vs51, vs37, 0 + xxspltd vs52, vs38, 1 + xxspltd vs53, vs38, 0 + xxspltd vs54, vs39, 1 + xxspltd vs55, vs39, 0 + + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + stxv vs42, 32(BBO) + stxv vs43, 48(BBO) + + stxv vs44, 64+0(BBO) + stxv vs45, 64+16(BBO) + stxv vs46, 64+32(BBO) + stxv vs47, 64+48(BBO) + + stxv vs48, 128+ 0(BBO) + stxv vs49, 128+ 16(BBO) + stxv vs50, 128+ 32(BBO) + stxv vs51, 128+ 48(BBO) + + stxv vs52, 192 + 0(BBO) + stxv vs53, 192 + 16(BBO) + stxv vs54, 192+ 32(BBO) + stxv vs55, 192 + 48(BBO) + addi BBO, BBO, 256 + +.endm + diff --git a/param.h b/param.h index 4dcd96a75..d0b8518c9 100644 --- a/param.h +++ b/param.h @@ -2251,12 +2251,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_P 640
 #define DGEMM_DEFAULT_P 128
 #define CGEMM_DEFAULT_P 640
-#define ZGEMM_DEFAULT_P 320
+#define ZGEMM_DEFAULT_P 512

 #define SGEMM_DEFAULT_Q 1408
 #define DGEMM_DEFAULT_Q 384
 #define CGEMM_DEFAULT_Q 640
-#define ZGEMM_DEFAULT_Q 640
+#define ZGEMM_DEFAULT_Q 1152

 #define SYMV_P 8

From c00289ba543121a78c5ab07a8e45385cc12fb9a8 Mon Sep 17 00:00:00 2001
From: TiborGY
Date: Sat, 1 Jun 2019 21:30:06 +0200
Subject: [PATCH 068/127] upload thread safety test folder

---
 cpp_thread_test/Makefile                   |  14 +++
 cpp_thread_test/cpp_thread_safety_common.h |  55 +++++++++++
 cpp_thread_test/dgemm_thread_safety.cpp    |  92 +++++++++++++++++++
 cpp_thread_test/dgemv_thread_safety.cpp    | 101 +++++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 cpp_thread_test/Makefile
 create mode 100644 cpp_thread_test/cpp_thread_safety_common.h
 create mode 100644 cpp_thread_test/dgemm_thread_safety.cpp
 create mode 100644 cpp_thread_test/dgemv_thread_safety.cpp

diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile
new file mode 100644
index 000000000..81e3470ef
--- /dev/null
+++ b/cpp_thread_test/Makefile
@@ -0,0 +1,14 @@
+include ../Makefile.rule
+
+all :: dgemv_tester dgemm_tester
+
+dgemv_tester :
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
+	./dgemv_tester
+
+dgemm_tester : dgemv_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
+	./dgemm_tester
+
+clean ::
+	rm -f dgemv_tester dgemm_tester

diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h
new file mode 100644
index 000000000..60ab5bb2f
--- /dev/null
+++ b/cpp_thread_test/cpp_thread_safety_common.h
@@ -0,0 +1,55 @@
+inline void pauser(){
+    /// a portable way to pause a program
+    std::string dummy;
+    std::cout << "Press enter to continue...";
+    std::getline(std::cin, dummy);
+}
+
+void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
+	for(uint32_t i=0; i<numMat; i++){
+		for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
+			matBlock[i][j] = rngdist(PRNG);
+		}
+	}
+	for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
+		for(uint32_t j=0; j<numMat; j++){
+			matBlock[i+j] = matBlock[j]; //copy the first numMat matrices to all other threads, so every thread starts from identical inputs
+		}
+	}
+}
+
+void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
+	for(uint32_t i=0; i<numVec; i++){
+		for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+			vecBlock[i][j] = rngdist(PRNG);
+		}
+	}
+	for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
+		for(uint32_t j=0; j<numVec; j++){
+			vecBlock[i+j] = vecBlock[j]; //copy the first numVec vectors to all other threads
+		}
+	}
+}
+
+std::mt19937_64 InitPRNG(){
+	std::random_device rd;
+	std::mt19937_64 PRNG(rd());
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
+	//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
+	for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
+	return PRNG;
+}
+
+void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
+	for (uint32_t i=0;i<(numConcurrentThreads*numMat);i++){
+		std::cout<<i<<std::endl;
+		for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+			for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
+				std::cout<<matBlock[i][j*randomMatSize + k]<<"  ";
+			}
+			std::cout<<std::endl;
+		}
+		std::cout<<std::endl;
+	}
+}

diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp
new file mode 100644
--- /dev/null
+++ b/cpp_thread_test/dgemm_thread_safety.cpp
@@ -0,0 +1,92 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <future>
+#include <omp.h>
+#include "../cblas.h"
+#include "cpp_thread_safety_common.h"
+
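+// Worker used by the test rounds below: each concurrent thread performs one
+// plain column-major DGEMM, C = 1.0*A*B + 0.1*C, on its own copy of the same
+// inputs, so every thread has to reproduce the same C up to rounding error.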
+void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
+	cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
+}
+
+int main(int argc, char* argv[]){
+	blasint randomMatSize = 1024; //dimension of the random square matrices used
+	uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
+	uint32_t numTestRounds = 16; //number of testing rounds before success exit
+
+	if (argc > 4){
+		std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
+		abort();
+	}
+	if (argc == 4){
+		std::vector<std::string> cliArgs;
+		for (int i = 1; i < argc; i++){
+			cliArgs.push_back(argv[i]);
+			std::cout<<cliArgs.at(i-1)<<std::endl;
+		}
+		randomMatSize = std::stoul(cliArgs.at(0));
+		numConcurrentThreads = std::stoul(cliArgs.at(1));
+		numTestRounds = std::stoul(cliArgs.at(2));
+	}
+
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
+	std::vector<std::future<void>> futureBlock(numConcurrentThreads);
+
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"| DGEMM thread safety tester |\n";
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<std::endl;
+	std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl;
+	std::cout<<"Number of testing rounds : "<<numTestRounds<<std::endl;
+	std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+
+	std::cout<<"Initializing random number generator..."<<std::flush;
+	std::mt19937_64 PRNG = InitPRNG();
+	std::cout<<"done\n";
+
+	std::cout<<"Allocating matrices..."<<std::flush;
+	for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
+		matBlock.at(i).resize(randomMatSize*randomMatSize);
+	}
+	std::cout<<"done\n";
+
+	std::cout<<"Filling matrices with random numbers..."<<std::flush;
+	FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
+	std::cout<<"done\n"<<std::endl;
+
+	std::cout<<"Testing CBLAS DGEMM thread safety"<<std::endl;
+	omp_set_num_threads(numConcurrentThreads);
+	for(uint32_t R=0; R<numTestRounds; R++){
+		std::cout<<"DGEMM round #"<<R<<std::endl;
+		std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
+		#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
+		for (uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
+		}
+		std::cout<<"done\n";
+		std::cout<<"Waiting for threads to finish..."<<std::flush;
+		for (uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i].get();
+		}
+		std::cout<<"done\n";
+		std::cout<<"Comparing results from different threads..."<<std::flush;
+		for (uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
+			for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
+				if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
+					std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
+					std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
+					return -1;
+				}
+			}
+		}
+		std::cout<<"OK!\n"<<std::endl;
+	}
+	std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
+	return 0;
+}

diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp
new file mode 100644
--- /dev/null
+++ b/cpp_thread_test/dgemv_thread_safety.cpp
@@ -0,0 +1,101 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <future>
+#include <omp.h>
+#include "../cblas.h"
+#include "cpp_thread_safety_common.h"
+
+void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
+	const blasint inc = 1;
+	cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
+}
+
+int main(int argc, char* argv[]){
+	blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
+	uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
+	uint32_t numTestRounds = 16; //number of testing rounds before success exit
+
+	if (argc > 4){
+		std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
+		abort();
+	}
+	if (argc == 4){
+		std::vector<std::string> cliArgs;
+		for (int i = 1; i < argc; i++){
+			cliArgs.push_back(argv[i]);
+			std::cout<<cliArgs.at(i-1)<<std::endl;
+		}
+		randomMatSize = std::stoul(cliArgs.at(0));
+		numConcurrentThreads = std::stoul(cliArgs.at(1));
+		numTestRounds = std::stoul(cliArgs.at(2));
+	}
+
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	std::vector<std::vector<double>> matBlock(numConcurrentThreads);
+	std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
+	std::vector<std::future<void>> futureBlock(numConcurrentThreads);
+
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"| DGEMV thread safety tester |\n";
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<std::endl;
+	std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl;
+	std::cout<<"Number of testing rounds : "<<numTestRounds<<std::endl;
+	std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+
+	std::cout<<"Initializing random number generator..."<<std::flush;
+	std::mt19937_64 PRNG = InitPRNG();
+	std::cout<<"done\n";
+
+	std::cout<<"Allocating matrices and vectors..."<<std::flush;
+	for(uint32_t i=0; i<numConcurrentThreads; i++){
+		matBlock.at(i).resize(randomMatSize*randomMatSize);
+	}
+	for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
+		vecBlock.at(i).resize(randomMatSize);
+	}
+	std::cout<<"done\n";
+
+	std::cout<<"Filling matrices and vectors with random numbers..."<<std::flush;
+	FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
+	FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
+	std::cout<<"done\n"<<std::endl;
+
+	std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
+	omp_set_num_threads(numConcurrentThreads);
+	for(uint32_t R=0; R<numTestRounds; R++){
+		std::cout<<"DGEMV round #"<<R<<std::endl;
+		std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
+		#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
+		for (uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
+		}
+		std::cout<<"done\n";
+		std::cout<<"Waiting for threads to finish..."<<std::flush;
+		for (uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i].get();
+		}
+		std::cout<<"done\n";
+		std::cout<<"Comparing results from different threads..."<<std::flush;
+		for (uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
+			for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+				if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
+					std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
+					std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
+					return -1;
+				}
+			}
+		}
+		std::cout<<"OK!\n"<<std::endl;
+	}
+	std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
+	return 0;
+}

From: TiborGY
Date: Sat, 1 Jun 2019 21:32:52 +0200
Subject: [PATCH 069/127] hook up c++ thread safety test (main Makefile)

---
 Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 21096f893..20ef1e868 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ endif

 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))

-SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
+SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

 .PHONY : all libs netlib $(RELA) test ctest shared install
 .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@@ -127,6 +127,9 @@ ifndef NO_FBLAS
 endif
 ifndef NO_CBLAS
 	$(MAKE) -C ctest all
+ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
+	$(MAKE) -C cpp_thread_test all
+endif
 endif
 endif

From 16f3df5d3551ff705d5d23dcdf26853114fb6956 Mon Sep 17 00:00:00 2001
From: TiborGY
Date: Sat, 1 Jun 2019 21:36:41 +0200
Subject: [PATCH 070/127] add c++ thread test option to Makefile.rule

---
 Makefile.rule | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Makefile.rule b/Makefile.rule
index 7c128fb49..209934991 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -220,6 +220,21 @@ COMMON_PROF = -pg
 # SYMBOLPREFIX=
 # SYMBOLSUFFIX=

+# Run a C++ based thread safety tester after the build is done.
+# This is mostly intended as a developer feature to spot regressions, but users and
+# package maintainers can enable this if they have doubts about the thread safety of
+# the library, given the configuration in this file.
+# By default, the thread safety tester launches 52 concurrent calculations at the
+# same time.
+#
+# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
+#
+# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
+# an OpenMP implementation. If you are cross-compiling, this test will probably not
+# work at all.
+#
+# CPP_THREAD_SAFETY_TEST = 1
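+#
+# A typical invocation (assuming the usual in-tree GNU make build) would be
+# something like
+#
+#     make CPP_THREAD_SAFETY_TEST=1
+#
+# which builds the library as usual and then compiles and runs the dgemv and
+# dgemm testers from the cpp_thread_test folder.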
+
 #
 # End of user configuration
 #

From 27649b95430cbed40923db4ab45119af6b05acb3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 3 Jun 2019 11:01:33 +0200
Subject: [PATCH 071/127] Document NO_AVX512 for #2151

---
 Makefile.rule | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.rule b/Makefile.rule
index 255d1da46..65d04ee3e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -163,6 +163,10 @@ NO_AFFINITY = 1
 # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
 # NO_AVX2 = 1

+# Don't use SkylakeX optimizations if binutils or the compiler is too old (the build
+# system will try to determine this automatically)
+# NO_AVX512 = 1
+
 # Don't use parallel make.
 # NO_PARALLEL_MAKE = 1

From a469b32cf43772bb14253a405be8f088ce3a9d83 Mon Sep 17 00:00:00 2001
From: AbdelRauf
Date: Fri, 31 May 2019 22:48:16 +0000
Subject: [PATCH 072/127] sgemm pipeline improved, zgemm rewritten without inner packs, ABI lxvx v20 fixed with vs52

---
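Note on the register renaming below: lxv/stxv take a VSX register operand, and
the AltiVec registers v0-v31 overlay vs32-vs63, so the callee-saved vector
registers v20-v31 have to be spelled vs52-vs63 in these save/restore sequences.
The earlier "stxv v20, 288(SP)" form appears to have been assembled as a store
of vs20, leaving the non-volatile vector registers required by the ABI unsaved.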
 benchmark/gemm.c                   |    2 +-
 kernel/power/KERNEL.POWER9         |    2 +-
 kernel/power/dgemm_kernel_power9.S |   48 +-
 kernel/power/sgemm_kernel_power9.S |  140 +-
 kernel/power/sgemm_logic_power9.S  |  192 ++-
 kernel/power/sgemm_macros_power9.S |  881 ++++------
 kernel/power/zgemm_kernel_power9.S |  114 +-
 kernel/power/zgemm_logic_power9.S  |  806 ++++++----
 kernel/power/zgemm_macros_power9.S | 2307 +++++++++++++---------------
 param.h                            |    8 +-
 10 files changed, 2073 insertions(+), 2427 deletions(-)

diff --git a/benchmark/gemm.c b/benchmark/gemm.c
index 85bcbc710..dd016a7c3 100644
--- a/benchmark/gemm.c
+++ b/benchmark/gemm.c
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
   for (i = 0; i < m * n * COMPSIZE; i++) {
     c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
   }
-	
+
   fprintf(stderr, "          SIZE                   Flops             Time\n");

   for (i = from; i <= to; i += step) {

diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index 5c10ad64a..440eaab1b 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -42,7 +42,7 @@ ZGEMMKERNEL    =  zgemm_kernel_power9.S
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
 ZGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
-ZGEMMITCOPY    = zgemm_tcopy_8_power8.S
+ZGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
 ZGEMMONCOPYOBJ =  zgemm_oncopy.o
 ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
 ZGEMMINCOPYOBJ =  zgemm_incopy.o

diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S
index a1762dcf2..2fb1b27ef 100644
--- a/kernel/power/dgemm_kernel_power9.S
+++ b/kernel/power/dgemm_kernel_power9.S
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	std	r14, 280(SP)

-	stxv	v20, 288(SP)
-	stxv	v21, 304(SP)
-	stxv	v22, 320(SP)
-	stxv	v23, 336(SP)
-	stxv	v24, 352(SP)
-	stxv	v25, 368(SP)
-	stxv	v26, 384(SP)
-	stxv	v27, 400(SP)
-	stxv	v28, 416(SP)
-	stxv	v29, 432(SP)
-	stxv	v30, 448(SP)
-	stxv	v31, 464(SP)
+	stxv	vs52, 288(SP)
+	stxv	vs53, 304(SP)
+	stxv	vs54, 320(SP)
+	stxv	vs55, 336(SP)
+	stxv	vs56, 352(SP)
+	stxv	vs57, 368(SP)
+	stxv	vs58, 384(SP)
+	stxv	vs59, 400(SP)
+	stxv	vs60, 416(SP)
+	stxv	vs61, 432(SP)
+	stxv	vs62, 448(SP)
+	stxv	vs63, 464(SP)

 	stfd	f1, ALPHA_SP
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld	r15, 272(SP)
 	ld	r14, 280(SP)

-	lxv	v20, 288(SP)
-	lxv	v21, 304(SP)
-	lxv	v22, 320(SP)
-	lxv	v23, 336(SP)
-	lxv	v24, 352(SP)
-	lxv	v25, 368(SP)
-	lxv	v26, 384(SP)
-	lxv	v27, 400(SP)
-	lxv	v28, 416(SP)
-	lxv	v29, 432(SP)
-	lxv	v30, 448(SP)
-	lxv	v31, 464(SP)
+	lxv	vs52, 288(SP)
+	lxv	vs53, 304(SP)
+	lxv	vs54, 320(SP)
+	lxv	vs55, 336(SP)
+	lxv	vs56, 352(SP)
+	lxv	vs57, 368(SP)
+	lxv	vs58, 384(SP)
+	lxv	vs59, 400(SP)
+	lxv	vs60, 416(SP)
+	lxv	vs61, 432(SP)
+	lxv	vs62, 448(SP)
+	lxv	vs63, 464(SP)

 	addi	SP, SP, STACKSIZE
 	blr

diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S
index f408cdc17..7a0f3143e 100644
--- a/kernel/power/sgemm_kernel_power9.S
+++ b/kernel/power/sgemm_kernel_power9.S
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOAD ld #define STACKSIZE (512 ) - +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ #define M r3 #define N r4 #define K r5 @@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROFCODE addi SP, SP, -STACKSIZE - li r0, 0 + mflr r0 + stfd f14, 0(SP) stfd f15, 8(SP) @@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) - + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) #if defined(TRMMKERNEL) @@ -157,72 +158,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif slwi LDC, LDC, 2 - -/* cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 -*/ /*alpha is stored in f1. convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 /*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ lis T2, perm_const2@highest - ori T2, T2, perm_const2@higher - rldicr T2, T2, 32, 31 - oris T2, T2, perm_const2@h - ori T2, T2, perm_const2@l - lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l ori T1, T1, perm_const1@l - + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 mtvsrdd permute_mask,T2,T1 - - lis T2, save_permute_12@highest - ori T2, T2, save_permute_12@higher - rldicr T2, T2, 32, 31 - oris T2, T2, save_permute_12@h - ori T2, T2, save_permute_12@l - - lis T1, save_permute_11@highest - ori T1, T1, save_permute_11@higher - rldicr T1, T1, 32, 31 - oris T1, T1, save_permute_11@h - ori T1, T1, save_permute_11@l - - mtvsrdd save_permute_1,T2,T1 - - lis T2, save_permute_22@highest - ori T2, T2, save_permute_22@higher - rldicr T2, T2, 32, 31 - oris T2, T2, save_permute_22@h - ori T2, T2, save_permute_22@l - - lis T1, save_permute_21@highest - ori T1, T1, save_permute_21@higher - rldicr T1, T1, 32, 31 - oris T1, T1, save_permute_21@h - ori T1, T1, save_permute_21@l - - mtvsrdd save_permute_2,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 #include "sgemm_logic_power9.S" -.L999: - addi r3, 0, 0 - +.L999: lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) @@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) - - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) + ld r0, FLINK_SAVE(SP) - addi SP, SP, STACKSIZE + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE blr + EPILOGUE #endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index c149cb903..25e8c8387 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,5 +1,94 @@ #define MY_ALIGN .align 3 +b L8 + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_0 + mtctr L + MY_ALIGN + +LSGEMM_L8x16_LOOP: + + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_2 64,32, 15,0 + KERNEL8x16_I1_L4_2 64,32, 16,0 + KERNEL8x16_I1_L4_2 64,32, 17,0 + KERNEL8x16_I1_L4_2 64,32, 18,0 + KERNEL8x16_I1_L4_2 64,32, 19,0 + KERNEL8x16_I1_L4_2 64,32, 20,0 + KERNEL8x16_I1_L4_2 64,32, 21,0 + KERNEL8x16_I1_L4_2 64,32, 22,0 + KERNEL8x16_I1_L4_2 64,32, 23,0 + KERNEL8x16_I1_L4_2 64,32, 24,0 + KERNEL8x16_I1_L4_2 64,32, 25,0 + KERNEL8x16_I1_L4_2 64,32, 26,0 + KERNEL8x16_I1_L4_2 64,32, 27,0 + KERNEL8x16_I1_L4_2 64,32, 28,0 + KERNEL8x16_I1_L4_2 64,32, 29,0 + KERNEL8x16_I1_L4_2 64,32, 30,0 + KERNEL8x16_I1_L4_2 64,32, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16 0, AO, BO, 64, 32 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_3 64,32, 15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_3 64,32, 7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_3 64,32, 3,1 + blr + +L8: #if defined(TRMMKERNEL) && !defined(LEFT) neg TEMP_REG, OFFSET #endif @@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN: REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ + srawi. L, T12, 7 /**(T11-1) % 128x */ #else mr T12, K addi T12,T12, -1 - srawi. L, T12, 6 /**(K-1) % 64x */ + srawi. 
L, T12, 7 /**(K-1) % 128x */ #endif ZERO8x16 ble LSGEMM_L8x16_SUB0 - - MY_ALIGN -LSGEMM_L8x16_LOOP_START: - - LOAD8x16_0 /*we already zeroed */ - /*##OffsetA=64 OffsetB=32 - #addi AO,AO,2112 - #addi BO,BO,32 */ - - mtctr L - - MY_ALIGN - -LSGEMM_L8x16_LOOP: - - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_2 64,32, 15,1 - - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - - END8x16 0, AO, BO, 64, 32 - - b LSGEMM_L8x16_SUB1 + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 MY_ALIGN LSGEMM_L8x16_SUB0: #if defined(TRMMKERNEL) - andi. L, T11, 127 + andi. L, T11, 255 + cmpwi T11,128 #else - andi. L, K, 127 + andi. L, K, 255 + cmpwi K,128 #endif - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L8x16_SAVE + + bne LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB2_128: + bl LSGEMM_L8x16_L64_SUB + bl LSGEMM_L8x16_L64_SUB + b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: - - srawi. T10,L, 5 + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 ble LSGEMM_L8x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L8x16_SUB2_LOOP: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_3 64,32, 7,1 - bdnz LSGEMM_L8x16_SUB2_LOOP - MY_ALIGN + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN LSGEMM_L8x16_SUB2_16: andi. T10,L, 16 ble LSGEMM_L8x16_SUB2_8 - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_3 64,32, 3,1 + bl LSGEMM_L8x16_L16_SUB MY_ALIGN LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 @@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 ble LSGEMM_L8x16_SAVE KERNEL8x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L8x16_SUB2 + MY_ALIGN LSGEMM_L8x16_SAVE: diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index c61f419ac..3f86a1d25 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 + KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast @@ -112,15 +112,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxv vs24, 0(BO) lxv vs28, 16(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask lxv vs0, 0(AO) lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 - + lxv vs2, 32(AO) + lxv vs3, 48(AO) xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 @@ -259,247 +258,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 - - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - 
xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 +KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete .endm @@ -509,224 +269,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
END8x16 \First, AO, BO, 64,32 .endm -.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) +.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs52, vs0,vs29 + xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs60, vs0,vs31 xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs37, vs1,vs25 - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) .endif + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs62, vs2,vs31 + + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp 
vs36, vs4,vs9 .if \Complete==0 lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif +.endif .if \IsLast==1 .if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) .else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) + addi \BREG, \BREG, DISP16(\Index,64) + .endif +.endif + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask .endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs52, vs4,vs13 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 .endif + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs60, vs4,vs15 .if \Complete==0 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 + +.endif - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - xvmulsp vs50, vs6,vs12 - xvmulsp vs51, vs7,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - xvmulsp vs54, vs6,vs13 - xvmulsp vs55, vs7,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - xvmulsp vs58, vs6,vs14 - xvmulsp vs59, vs7,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - xvmulsp vs62, vs6,vs15 - xvmulsp vs63, vs7,vs15 - -.else - xvmaddasp vs40, vs4,vs10 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs37, vs5,vs9 xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs62, vs6,vs15 + + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs51, vs7,vs12 + xvmaddasp vs55, vs7,vs13 + xvmaddasp vs59, vs7,vs14 xvmaddasp vs63, vs7,vs15 - -.endif - + .endm @@ -763,7 +433,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. xxmrghw vs2, vs37, vs41 xxmrghw vs3, vs33, vs45 - +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif xxmrglw vs16, vs34, vs46 xxmrglw vs18, vs38, vs42 @@ -784,176 +457,203 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxmrghw vs30, vs39, vs43 xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) +#ifndef TRMMKERNEL lxv vs34, 32(CO) lxv vs35, 48(CO) #endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 #ifndef TRMMKERNEL lxv vs36, 0(T1) lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL lxv vs38, 32(T1) lxv vs39, 48(T1) #endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + #ifndef TRMMKERNEL lxv vs40, 0(T2) lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL lxv vs42, 32(T2) lxv vs43, 48(T2) #endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 #ifndef TRMMKERNEL lxv vs44, 0(T3) - lxv vs45, 16(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL lxv vs46, 32(T3) lxv vs47, 48(T3) #endif - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 + + - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 - +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif xxperm vs24, vs30, save_permute_1 xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r +#else xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv 
vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - stxv vs36, 0(T1) - stxv vs37, 16(T1) stxv vs38, 32(T1) stxv vs39, 48(T1) +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T2) stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif stxv vs42, 32(T2) stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif stxv vs44, 0(T3) stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif stxv vs46, 32(T3) stxv vs47, 48(T3) /*****the same with the second 8X8 ****/ -#ifndef TRMMKERNEL - + #ifndef TRMMKERNEL lxv vs32, 0(T4) lxv vs33, 16(T4) - lxv vs34, 32(T4) - lxv vs35, 48(T4) - lxv vs36, 0(T5) - lxv vs37, 16(T5) - lxv vs38,32(T5) - lxv vs39, 48(T5) #endif - xxmrglw vs8, vs48, vs60 xxmrglw vs10, vs52, vs56 - +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif xxmrghw vs1, vs48, vs60 xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif xxmrglw vs12, vs49, vs61 xxmrglw vs14, vs53, vs57 - -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) - lxv vs42, 32(T6) - lxv vs43, 48(T6) - lxv vs44, 0(T7) - lxv vs45, 16(T7) - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + xxmrghw vs2, vs53, vs57 xxmrghw vs3, vs49, vs61 - +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif xxmrglw vs16, vs50, vs62 xxmrglw vs18, vs54, vs58 - +#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxmrghw vs4, vs54, vs58 xxmrghw vs5, vs50, vs62 - +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 - + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif xxlor vs17, vs16, vs16 xxlor vs19, vs18, vs18 xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 + xxmrghw vs31, vs51, vs63 + + xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 @@ -965,11 +665,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs27, vs26, vs26 xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 - + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif xxperm vs16, vs4, save_permute_1 xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 xxperm vs24, vs30, save_permute_1 @@ -977,64 +686,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r +#ifdef TRMMKERNEL xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r +#else xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif - stxv vs32, 0(T4) - stxv vs33, 16(T4) - stxv vs34, 32(T4) - stxv vs35, 48(T4) - stxv vs36, 0(T5) - stxv vs37, 16(T5) + + stxv vs38, 32(T5) stxv vs39, 48(T5) + #ifdef TRMMKERNEL xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - + xvmulsp vs41, vs14, alpha_r +#else xvmaddasp vs40, vs10, alpha_r xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - #endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif stxv vs42, 32(T6) stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T7) stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T7) stxv vs47, 48(T7) @@ -1224,12 +946,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 @@ -1247,21 +971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 @@ -1285,21 +1009,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 - + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 @@ -1323,22 +1048,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - .if \Complete==0 lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.if \Complete==0 lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask +.if \Complete==0 xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index e655f0bfe..a41bcec77 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -30,10 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LOAD ld -#define STACKSIZE 32192 +#define STACKSIZE 512 #define FZERO 312+192(SP) - + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ #define M r3 #define N r4 @@ -56,20 +57,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FRAMEPOINTER r12 -#define BBUFFER r14 +#define T10 r14 #define L r15 -#define ALPHA r16 +#define T8 r16 #define T5 r17 #define T2 r19 -#define BBO r20 -#define o8 r21 +#define T9 r20 +#define T6 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 -#define o16 r27 +#define T7 r27 #define T3 r28 #define T4 r29 @@ -82,12 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROFCODE mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - li r0, 0 - + addi SP, SP, -STACKSIZE + mflr r0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) @@ -111,6 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f30, 128(SP) stfd f31, 136(SP) + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ std r31, 144(SP) std r30, 152(SP) @@ -132,21 +131,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) - stw r0, FZERO #ifdef linux ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) @@ -162,35 +161,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemm_macros_power9.S" - cmpwi cr0, M, 0 - ble L999 - cmpwi cr0, N, 0 - ble L999 - cmpwi cr0, K, 0 - ble L999 + slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li o8 , 8 - li o16 , 16 - - addi BBUFFER, SP, 512+4096 - li T1, -4096 - and BBUFFER, BBUFFER, T1 - + li PRE, 512 + li r0, 0 - addi ALPHA, SP, 296+192 - - xxlor alpha_r,vs1,vs1 /*copy from register f1 */ - xxlor alpha_i,vs2,vs2 /*copy from register f2 */ +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif .align 4 #include "zgemm_logic_power9.S" L999: - addi r3, 0, 0 - + lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) @@ -233,24 +221,24 @@ L999: ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) - - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE blr EPILOGUE diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index 77ce36294..01685fe79 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -25,155 +25,348 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
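The conditional xvnegdp pair above is the counterpart of the sign table in AGGREGATE_REALS_IMAGES further down: for the CC/CR/RC/RR variants the hot loop produces the product with both components negated, and multiplying by a pre-negated alpha cancels that sign at no extra cost per iteration. A scalar sketch of the identity being exploited, illustrative only (the kernel of course works on packed vector lanes):

    #include <complex.h>
    #include <stdio.h>

    int main(void)
    {
        double complex a = 1 + 2*I, b = 3 + 4*I, alpha = 0.5 + 0.25*I;
        /* CC variant: the wanted contribution is alpha*conj(a)*conj(b) */
        double complex want = alpha * conj(a) * conj(b);
        /* the loop accumulates the negated product; negating alpha once
           up front (xvnegdp) restores it: (-alpha)*(-x) == alpha*x */
        double complex got = (-alpha) * -(conj(a) * conj(b));
        printf("%g%+gi  %g%+gi\n", creal(want), cimag(want),
               creal(got), cimag(got));
        return 0;
    }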
*****************************************************************************/ #define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ + + + +/* 2x8 MAIN 128x+1 LOOP */ +ZGEMM_L2x8_LMAIN_SUB: + mtctr L + LOAD2x8 0 + MY_ALIGN +ZGEMM_L2x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,0 + KERNEL2x8_L 128,32,16,0 + KERNEL2x8_L 128,32,17,0 + KERNEL2x8_L 128,32,18,0 + KERNEL2x8_L 128,32,19,0 + KERNEL2x8_L 128,32,20,0 + KERNEL2x8_L 128,32,21,0 + KERNEL2x8_L 128,32,22,0 + KERNEL2x8_L 128,32,23,0 + KERNEL2x8_L 128,32,24,0 + KERNEL2x8_L 128,32,25,0 + KERNEL2x8_L 128,32,26,0 + KERNEL2x8_L 128,32,27,0 + KERNEL2x8_L 128,32,28,0 + KERNEL2x8_L 128,32,29,0 + KERNEL2x8_L 128,32,30,0 + KERNEL2x8_L 128,32,31,0 + KERNEL2x8_L 128,32,32,0 + KERNEL2x8_L 128,32,33,0 + KERNEL2x8_L 128,32,34,0 + KERNEL2x8_L 128,32,35,0 + KERNEL2x8_L 128,32,36,0 + KERNEL2x8_L 128,32,37,0 + KERNEL2x8_L 128,32,38,0 + KERNEL2x8_L 128,32,39,0 + KERNEL2x8_L 128,32,40,0 + KERNEL2x8_L 128,32,41,0 + KERNEL2x8_L 128,32,42,0 + KERNEL2x8_L 128,32,43,0 + KERNEL2x8_L 128,32,44,0 + KERNEL2x8_L 128,32,45,0 + KERNEL2x8_L 128,32,46,0 + KERNEL2x8_L 128,32,47,0 + KERNEL2x8_L 128,32,48,0 + KERNEL2x8_L 128,32,49,0 + KERNEL2x8_L 128,32,50,0 + KERNEL2x8_L 128,32,51,0 + KERNEL2x8_L 128,32,52,0 + KERNEL2x8_L 128,32,53,0 + KERNEL2x8_L 128,32,54,0 + KERNEL2x8_L 128,32,55,0 + KERNEL2x8_L 128,32,56,0 + KERNEL2x8_L 128,32,57,0 + KERNEL2x8_L 128,32,58,0 + KERNEL2x8_L 128,32,59,0 + KERNEL2x8_L 128,32,60,0 + KERNEL2x8_L 128,32,61,0 + KERNEL2x8_L 128,32,62,0 + KERNEL2x8_L 128,32,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: + END2x8 AO, BO, 128,32 + blr + + MY_ALIGN +ZGEMM_2x8_L64_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,0 + KERNEL2x8_L 128,32,16,0 + KERNEL2x8_L 128,32,17,0 + KERNEL2x8_L 128,32,18,0 + KERNEL2x8_L 128,32,19,0 + KERNEL2x8_L 128,32,20,0 + KERNEL2x8_L 128,32,21,0 + KERNEL2x8_L 128,32,22,0 + KERNEL2x8_L 128,32,23,0 + KERNEL2x8_L 128,32,24,0 + KERNEL2x8_L 128,32,25,0 + KERNEL2x8_L 128,32,26,0 + KERNEL2x8_L 128,32,27,0 + KERNEL2x8_L 128,32,28,0 + KERNEL2x8_L 128,32,29,0 + KERNEL2x8_L 128,32,30,0 + KERNEL2x8_E 128,32,31,1 + blr + + + MY_ALIGN +ZGEMM_2x8_L32_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 
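These mini subroutines let the 2x8 block dispatch K in large unrolled chunks and mop up the remainder bit by bit: the 128x main loop runs under the CTR, and the leftover (K-1) & 127 iterations are peeled by the L64/L32/L16 helpers and the small tail cases that follow below. In C the decomposition looks roughly like this; k_steps is an invented name that merely counts what the branch ladder executes, a control-flow sketch rather than the kernel:

    /* One iteration is peeled into the up-front LOAD2x8; the rest of K-1
       is covered by the 128x main loop plus descending power-of-two arms. */
    long k_steps(long k)
    {
        long done = 1;                      /* peeled first iteration     */
        long t = k - 1;
        done += (t >> 7) * 128;             /* srawi. L, T1, 7: main loop */
        long rem = t & 127;                 /* andi. L, T1, 127           */
        for (long blk = 64; blk >= 1; blk >>= 1)
            if (rem & blk)                  /* ZGEMM_L2x8_SUB2_<blk> arms */
                done += blk;
        return done;                        /* equals k for any k >= 1    */
    }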
+ KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,1 + blr + MY_ALIGN + +ZGEMM_2x8_L16_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,1 + blr + MY_ALIGN + +ZGEMM_2x4_LMAIN_SUB: + mtctr L + LOAD2x4 0 + MY_ALIGN +ZGEMM_L2x4_LOOP: + KERNEL2x4_L 64,32,0,0 + KERNEL2x4_L 64,32,1,0 + KERNEL2x4_L 64,32,2,0 + KERNEL2x4_L 64,32,3,0 + KERNEL2x4_L 64,32,4,0 + KERNEL2x4_L 64,32,5,0 + KERNEL2x4_L 64,32,6,0 + KERNEL2x4_L 64,32,7,0 + KERNEL2x4_L 64,32,8,0 + KERNEL2x4_L 64,32,9,0 + KERNEL2x4_L 64,32,10,0 + KERNEL2x4_L 64,32,11,0 + KERNEL2x4_L 64,32,12,0 + KERNEL2x4_L 64,32,13,0 + KERNEL2x4_L 64,32,14,0 + KERNEL2x4_L 64,32,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: + END2x4 AO, BO, 64,32 + blr + + MY_ALIGN +ZGEMM_2x4_L16_SUB: + LOAD2x4 0 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_L 64,32, 1,0 + KERNEL2x4_L 64,32, 2,0 + KERNEL2x4_L 64,32, 3,0 + KERNEL2x4_L 64,32, 4,0 + KERNEL2x4_L 64,32, 5,0 + KERNEL2x4_L 64,32, 6,0 + KERNEL2x4_E 64,32, 7,1 + blr + + MY_ALIGN +ZGEMM_2x4_L8_SUB: + LOAD2x4 0 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_L 64,32, 1,0 + KERNEL2x4_L 64,32, 2,0 + KERNEL2x4_E 64,32, 3,1 + blr + +/* MAIN LOOP BEGINS */ + + MY_ALIGN +ZGEMM_L2: srawi. J, N, 1 ble ZGEMM_L2_END ZGEMM_L2_BEGIN: - - mr BO, B - mr BBO, BBUFFER - srawi. T1, K, 2 - ble ZGEMM_L2_COPYB1 - -ZGEMM_L2_COPYB8: - - addi T2, PRE, 128 - dcbt BO, PRE - dcbtst BBO, PRE - dcbtst BBO, T2 - ZCOPYB_8 - addic. T1, T1, -1 - - bgt ZGEMM_L2_COPYB8 - -ZGEMM_L2_COPYB1: - - andi. T1, K, 3 - ble ZGEMM_L2_COPYB_END - -ZGEMM_L2_COPYB_LOOP: - - ZCOPYB_2 - addic. T1, T1, -1 - - bgt ZGEMM_L2_COPYB_LOOP - -ZGEMM_L2_COPYB_END: - - mr CO, C - mr AO, A - slwi T1, LDC , 1 + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A add C, C, T1 srawi. I, M, 3 ble ZGEMM_L2x8_END - -ZGEMM_L2x8_BEGIN: - - - mr BO, BBUFFER + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 +ZGEMM_L2x8_BEGIN: mr T1, K + mr BO, B + dcbt B, r0 + dcbt AO, r0 + /* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 32x */ + /* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. L, T1, 7 /**(K-1) % 128x */ + ZERO2x8 ble ZGEMM_L2x8_SUB0 - - -ZGEMM_L2x8_LOOP_START: - - LOAD2x8 0 - li T2, 1024 - li T3, 1024+512 - li T4, 2048 - li T5, 2048+512 - mtctr L - - MY_ALIGN -ZGEMM_L2x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,64,0,0 - KERNEL2x8_L 128,64,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,64,2,0 - KERNEL2x8_L 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,64,4,0 - KERNEL2x8_L 128,64,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,64,6,0 - KERNEL2x8_L 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,64,8,0 - KERNEL2x8_L 128,64,9,0 - KERNEL2x8_L 128,64,10,0 - KERNEL2x8_L 128,64,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,64,12,0 - KERNEL2x8_L 128,64,13,0 - KERNEL2x8_L 128,64,14,0 - KERNEL2x8_L 128,64,15,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: - END2x8 AO, BO, 128, 64 - - b ZGEMM_L2x8_SUB1 - -ZGEMM_L2x8_SUB0: - - andi. L, K, 63 - - b ZGEMM_L2x8_SUB2 - -ZGEMM_L2x8_SUB1: - - andi. L, T1, 31 + bl ZGEMM_L2x8_LMAIN_SUB + + andi. L, T1, 127 ble ZGEMM_L2x8_SAVE - -ZGEMM_L2x8_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x8_SUB2_4 - mtctr T1 + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB0: + andi. 
L, K, 255 + cmpwi K,128 + bne ZGEMM_L2x8_SUB2 + MY_ALIGN +ZGEMM_L2x8_SUB2_128: + bl ZGEMM_2x8_L64_SUB + bl ZGEMM_2x8_L64_SUB + b ZGEMM_L2x8_SAVE MY_ALIGN -ZGEMM_L2x8_SUB2_LOOP: +ZGEMM_L2x8_SUB2: + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_32: + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_16: + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_8: + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 LOAD2x8 0 - KERNEL2x8_L 128,64, 0,0 - KERNEL2x8_L 128,64, 1,0 - KERNEL2x8_L 128,64, 2,0 - KERNEL2x8_E 128,64, 3,1 - bdnz ZGEMM_L2x8_SUB2_LOOP - MY_ALIGN + KERNEL2x8_L 128,32, 0,0 + KERNEL2x8_L 128,32, 1,0 + KERNEL2x8_L 128,32, 2,0 + KERNEL2x8_E 128,32, 3,1 + MY_ALIGN ZGEMM_L2x8_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x8_SUB2_2 LOAD2x8 0 - KERNEL2x8_L 128,64, 0,0 - KERNEL2x8_E 128,64, 1,1 + KERNEL2x8_L 128,32, 0,0 + KERNEL2x8_E 128,32, 1,1 MY_ALIGN ZGEMM_L2x8_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x8_SUB2_1 LOAD2x8 0 - KERNEL2x8_E 128,64, 0,1 + KERNEL2x8_E 128,32, 0,1 MY_ALIGN ZGEMM_L2x8_SUB2_1: andi. T1,L, 1 ble ZGEMM_L2x8_SAVE - KERNEL2x8 - -/* addic. L, L, -1 - bgt ZGEMM_L2x8_SUB2_1*/ + KERNEL2x8 ZGEMM_L2x8_SAVE: - + addic. I, I, -1 SAVE2x8 - addic. I, I, -1 bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN ZGEMM_L2x8_END: ZGEMM_L2x4_BEGIN: @@ -183,70 +376,50 @@ ZGEMM_L2x4_BEGIN: andi. T1, M, 4 ble ZGEMM_L2x4_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 - srawi. L, T1, 4 /**(K-1) % 16x */ - ZERO2x4 + ZERO2x4 + srawi. L, T1, 5 /**(K-1) % 32x */ + ble ZGEMM_L2x4_SUB0 - -ZGEMM_L2x4_LOOP_START: - LOAD2x4 0 - mtctr L - - MY_ALIGN -ZGEMM_L2x4_LOOP: - KERNEL2x4_L 64,64,0,0 - KERNEL2x4_L 64,64,1,0 - KERNEL2x4_L 64,64,2,0 - KERNEL2x4_L 64,64,3,0 - KERNEL2x4_L 64,64,4,0 - KERNEL2x4_L 64,64,5,0 - KERNEL2x4_L 64,64,6,0 - KERNEL2x4_L 64,64,7,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: - END2x4 AO, BO, 64, 64 - - b ZGEMM_L2x4_SUB1 - -ZGEMM_L2x4_SUB0: - - andi. L, K, 31 - + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE b ZGEMM_L2x4_SUB2 -ZGEMM_L2x4_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x4_SAVE - -ZGEMM_L2x4_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x4_SUB2_4 - mtctr T1 +ZGEMM_L2x4_SUB0: + andi. L, K, 63 + cmpwi K,32 + bne ZGEMM_L2x4_SUB2 + MY_ALIGN +ZGEMM_L2x4_SUB2_32: + bl ZGEMM_2x4_L16_SUB + bl ZGEMM_2x4_L16_SUB + b ZGEMM_L2x4_SAVE + MY_ALIGN +ZGEMM_L2x4_SUB2: + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB MY_ALIGN -ZGEMM_L2x4_SUB2_LOOP: - LOAD2x4 0 - KERNEL2x4_L 64,64, 0,0 - KERNEL2x4_L 64,64, 1,0 - KERNEL2x4_L 64,64, 2,0 - KERNEL2x4_E 64,64, 3,1 - bdnz ZGEMM_L2x4_SUB2_LOOP +ZGEMM_L2x4_SUB2_8: + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB MY_ALIGN ZGEMM_L2x4_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x4_SUB2_2 LOAD2x4 0 - KERNEL2x4_L 64,64, 0,0 - KERNEL2x4_E 64,64, 1,1 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_E 64,32, 1,1 MY_ALIGN ZGEMM_L2x4_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x4_SUB2_1 LOAD2x4 0 - KERNEL2x4_E 64,64, 0,1 + KERNEL2x4_E 64,32, 0,1 MY_ALIGN ZGEMM_L2x4_SUB2_1: andi. T1,L, 1 @@ -259,12 +432,11 @@ ZGEMM_L2x4_SAVE: ZGEMM_L2x4_END: -ZGEMM_L2x2_BEGIN: - +ZGEMM_L2x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L2x2_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. 
L, T1, 4 /**(K-1) % 16x */ @@ -277,18 +449,18 @@ ZGEMM_L2x2_LOOP_START: MY_ALIGN ZGEMM_L2x2_LOOP: - KERNEL2x2_L 32,64,0,0 - KERNEL2x2_L 32,64,1,0 - KERNEL2x2_L 32,64,2,0 - KERNEL2x2_L 32,64,3,0 - KERNEL2x2_L 32,64,4,0 - KERNEL2x2_L 32,64,5,0 - KERNEL2x2_L 32,64,6,0 - KERNEL2x2_L 32,64,7,1 + KERNEL2x2_L 32,32,0,0 + KERNEL2x2_L 32,32,1,0 + KERNEL2x2_L 32,32,2,0 + KERNEL2x2_L 32,32,3,0 + KERNEL2x2_L 32,32,4,0 + KERNEL2x2_L 32,32,5,0 + KERNEL2x2_L 32,32,6,0 + KERNEL2x2_L 32,32,7,1 bdnz ZGEMM_L2x2_LOOP MY_ALIGN ZGEMM_L2x2_LOOP_END: - END2x2 AO, BO, 32, 64 + END2x2 AO, BO, 32,32 b ZGEMM_L2x2_SUB1 @@ -310,24 +482,24 @@ ZGEMM_L2x2_SUB2: MY_ALIGN ZGEMM_L2x2_SUB2_LOOP: LOAD2x2 0 - KERNEL2x2_L 32,64, 0,0 - KERNEL2x2_L 32,64, 1,0 - KERNEL2x2_L 32,64, 2,0 - KERNEL2x2_E 32,64, 3,1 + KERNEL2x2_L 32,32, 0,0 + KERNEL2x2_L 32,32, 1,0 + KERNEL2x2_L 32,32, 2,0 + KERNEL2x2_E 32,32, 3,1 bdnz ZGEMM_L2x2_SUB2_LOOP MY_ALIGN ZGEMM_L2x2_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x2_SUB2_2 LOAD2x2 0 - KERNEL2x2_L 32,64, 0,0 - KERNEL2x2_E 32,64, 1,1 + KERNEL2x2_L 32,32, 0,0 + KERNEL2x2_E 32,32, 1,1 MY_ALIGN ZGEMM_L2x2_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x2_SUB2_1 LOAD2x2 0 - KERNEL2x2_E 32,64, 0,1 + KERNEL2x2_E 32,32, 0,1 MY_ALIGN ZGEMM_L2x2_SUB2_1: andi. T1,L, 1 @@ -339,12 +511,12 @@ ZGEMM_L2x2_SAVE: ZGEMM_L2x2_END: -ZGEMM_L2x1_BEGIN: +ZGEMM_L2x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L2x1_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 4 /**(K-1) % 16x */ @@ -358,18 +530,18 @@ ZGEMM_L2x1_LOOP_START: MY_ALIGN ZGEMM_L2x1_LOOP: - KERNEL2x1_L 16,64,0,0 - KERNEL2x1_L 16,64,1,0 - KERNEL2x1_L 16,64,2,0 - KERNEL2x1_L 16,64,3,0 - KERNEL2x1_L 16,64,4,0 - KERNEL2x1_L 16,64,5,0 - KERNEL2x1_L 16,64,6,0 - KERNEL2x1_L 16,64,7,1 + KERNEL2x1_L 16,32,0,0 + KERNEL2x1_L 16,32,1,0 + KERNEL2x1_L 16,32,2,0 + KERNEL2x1_L 16,32,3,0 + KERNEL2x1_L 16,32,4,0 + KERNEL2x1_L 16,32,5,0 + KERNEL2x1_L 16,32,6,0 + KERNEL2x1_L 16,32,7,1 bdnz ZGEMM_L2x1_LOOP MY_ALIGN ZGEMM_L2x1_LOOP_END: - END2x1 AO, BO, 16, 64 + END2x1 AO, BO, 16,32 b ZGEMM_L2x1_SUB1 @@ -391,24 +563,24 @@ ZGEMM_L2x1_SUB2: MY_ALIGN ZGEMM_L2x1_SUB2_LOOP: LOAD2x1 0 - KERNEL2x1_L 16,64, 0,0 - KERNEL2x1_L 16,64, 1,0 - KERNEL2x1_L 16,64, 2,0 - KERNEL2x1_E 16,64, 3,1 + KERNEL2x1_L 16,32, 0,0 + KERNEL2x1_L 16,32, 1,0 + KERNEL2x1_L 16,32, 2,0 + KERNEL2x1_E 16,32, 3,1 bdnz ZGEMM_L2x1_SUB2_LOOP MY_ALIGN ZGEMM_L2x1_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x1_SUB2_2 LOAD2x1 0 - KERNEL2x1_L 16,64, 0,0 - KERNEL2x1_E 16,64, 1,1 + KERNEL2x1_L 16,32, 0,0 + KERNEL2x1_E 16,32, 1,1 MY_ALIGN ZGEMM_L2x1_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x1_SUB2_1 LOAD2x1 0 - KERNEL2x1_E 16,64, 0,1 + KERNEL2x1_E 16,32, 0,1 MY_ALIGN ZGEMM_L2x1_SUB2_1: andi. T1,L, 1 @@ -442,36 +614,6 @@ ZGEMM_L1_BEGIN: andi. T1, N, 1 ble ZGEMM_L1_END - mr BO, B - mr BBO, BBUFFER - srawi. T1, K, 3 /*this time K/8 */ - ble ZGEMM_L1_COPYB1 - -ZGEMM_L1_COPYB8: - - addi T2, PRE, 128 - dcbt BO, PRE - dcbtst BBO, PRE - dcbtst BBO, T2 - ZCOPYB_8 - addic. T1, T1, -1 - - bgt ZGEMM_L1_COPYB8 - -ZGEMM_L1_COPYB1: - - andi. T1, K, 7 - ble ZGEMM_L1_COPYB_END - -ZGEMM_L1_COPYB_LOOP: - - ZCOPYB_1 - addic. T1, T1, -1 - - bgt ZGEMM_L1_COPYB_LOOP - -ZGEMM_L1_COPYB_END: - mr CO, C mr AO, A srawi. I, M, 3 @@ -480,7 +622,7 @@ ZGEMM_L1_COPYB_END: ZGEMM_L1x8_BEGIN: - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. 
L, T1, 5 /**(K-1) % 32x */ @@ -501,33 +643,33 @@ ZGEMM_L1x8_LOOP_START: ZGEMM_L1x8_LOOP: dcbt AO, PRE dcbt BO, PRE - KERNEL1x8_L 128,32,0,0 - KERNEL1x8_L 128,32,1,0 + KERNEL1x8_L 128,16,0,0 + KERNEL1x8_L 128,16,1,0 dcbt AO, T2 - KERNEL1x8_L 128,32,2,0 - KERNEL1x8_L 128,32,3,0 + KERNEL1x8_L 128,16,2,0 + KERNEL1x8_L 128,16,3,0 dcbt AO, T3 dcbt BO, T2 - KERNEL1x8_L 128,32,4,0 - KERNEL1x8_L 128,32,5,0 + KERNEL1x8_L 128,16,4,0 + KERNEL1x8_L 128,16,5,0 dcbt AO, T4 - KERNEL1x8_L 128,32,6,0 - KERNEL1x8_L 128,32,7,0 + KERNEL1x8_L 128,16,6,0 + KERNEL1x8_L 128,16,7,0 dcbt AO, T5 dcbt BO, T3 - KERNEL1x8_L 128,32,8,0 - KERNEL1x8_L 128,32,9,0 - KERNEL1x8_L 128,32,10,0 - KERNEL1x8_L 128,32,11,0 + KERNEL1x8_L 128,16,8,0 + KERNEL1x8_L 128,16,9,0 + KERNEL1x8_L 128,16,10,0 + KERNEL1x8_L 128,16,11,0 dcbt BO, T4 - KERNEL1x8_L 128,32,12,0 - KERNEL1x8_L 128,32,13,0 - KERNEL1x8_L 128,32,14,0 - KERNEL1x8_L 128,32,15,1 + KERNEL1x8_L 128,16,12,0 + KERNEL1x8_L 128,16,13,0 + KERNEL1x8_L 128,16,14,0 + KERNEL1x8_L 128,16,15,1 bdnz ZGEMM_L1x8_LOOP MY_ALIGN ZGEMM_L1x8_LOOP_END: - END1x8 AO, BO, 128, 32 + END1x8 AO, BO, 128,16 b ZGEMM_L1x8_SUB1 @@ -549,32 +691,30 @@ ZGEMM_L1x8_SUB2: MY_ALIGN ZGEMM_L1x8_SUB2_LOOP: LOAD1x8 0 - KERNEL1x8_L 128,32, 0,0 - KERNEL1x8_L 128,32, 1,0 - KERNEL1x8_L 128,32, 2,0 - KERNEL1x8_E 128,32, 3,1 + KERNEL1x8_L 128,16, 0,0 + KERNEL1x8_L 128,16, 1,0 + KERNEL1x8_L 128,16, 2,0 + KERNEL1x8_E 128,16, 3,1 bdnz ZGEMM_L1x8_SUB2_LOOP MY_ALIGN ZGEMM_L1x8_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x8_SUB2_2 LOAD1x8 0 - KERNEL1x8_L 128,32, 0,0 - KERNEL1x8_E 128,32, 1,1 + KERNEL1x8_L 128,16, 0,0 + KERNEL1x8_E 128,16, 1,1 MY_ALIGN ZGEMM_L1x8_SUB2_2: andi. T1,L, 2 ble ZGEMM_L1x8_SUB2_1 LOAD1x8 0 - KERNEL1x8_E 128,32, 0,1 + KERNEL1x8_E 128,16, 0,1 MY_ALIGN ZGEMM_L1x8_SUB2_1: andi. T1,L, 1 ble ZGEMM_L1x8_SAVE KERNEL1x8 - -/* addic. L, L, -1 - bgt ZGEMM_L1x8_SUB2_1*/ + ZGEMM_L1x8_SAVE: @@ -592,7 +732,7 @@ ZGEMM_L1x4_BEGIN: andi. T1, M, 4 ble ZGEMM_L1x4_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -605,26 +745,26 @@ ZGEMM_L1x4_LOOP_START: MY_ALIGN ZGEMM_L1x4_LOOP: - KERNEL1x4_L 64,32,0,0 - KERNEL1x4_L 64,32,1,0 - KERNEL1x4_L 64,32,2,0 - KERNEL1x4_L 64,32,3,0 - KERNEL1x4_L 64,32,4,0 - KERNEL1x4_L 64,32,5,0 - KERNEL1x4_L 64,32,6,0 - KERNEL1x4_L 64,32,7,0 - KERNEL1x4_L 64,32,8,0 - KERNEL1x4_L 64,32,9,0 - KERNEL1x4_L 64,32,10,0 - KERNEL1x4_L 64,32,11,0 - KERNEL1x4_L 64,32,12,0 - KERNEL1x4_L 64,32,13,0 - KERNEL1x4_L 64,32,14,0 - KERNEL1x4_L 64,32,15,1 + KERNEL1x4_L 64,16,0,0 + KERNEL1x4_L 64,16,1,0 + KERNEL1x4_L 64,16,2,0 + KERNEL1x4_L 64,16,3,0 + KERNEL1x4_L 64,16,4,0 + KERNEL1x4_L 64,16,5,0 + KERNEL1x4_L 64,16,6,0 + KERNEL1x4_L 64,16,7,0 + KERNEL1x4_L 64,16,8,0 + KERNEL1x4_L 64,16,9,0 + KERNEL1x4_L 64,16,10,0 + KERNEL1x4_L 64,16,11,0 + KERNEL1x4_L 64,16,12,0 + KERNEL1x4_L 64,16,13,0 + KERNEL1x4_L 64,16,14,0 + KERNEL1x4_L 64,16,15,1 bdnz ZGEMM_L1x4_LOOP MY_ALIGN ZGEMM_L1x4_LOOP_END: - END1x4 AO, BO, 64, 32 + END1x4 AO, BO, 64,16 b ZGEMM_L1x4_SUB1 @@ -646,24 +786,24 @@ ZGEMM_L1x4_SUB2: MY_ALIGN ZGEMM_L1x4_SUB2_LOOP: LOAD1x4 0 - KERNEL1x4_L 64,32, 0,0 - KERNEL1x4_L 64,32, 1,0 - KERNEL1x4_L 64,32, 2,0 - KERNEL1x4_E 64,32, 3,1 + KERNEL1x4_L 64,16, 0,0 + KERNEL1x4_L 64,16, 1,0 + KERNEL1x4_L 64,16, 2,0 + KERNEL1x4_E 64,16, 3,1 bdnz ZGEMM_L1x4_SUB2_LOOP MY_ALIGN ZGEMM_L1x4_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x4_SUB2_2 LOAD1x4 0 - KERNEL1x4_L 64,32, 0,0 - KERNEL1x4_E 64,32, 1,1 + KERNEL1x4_L 64,16, 0,0 + KERNEL1x4_E 64,16, 1,1 MY_ALIGN ZGEMM_L1x4_SUB2_2: andi. 
T1,L, 2 ble ZGEMM_L1x4_SUB2_1 LOAD1x4 0 - KERNEL1x4_E 64,32, 0,1 + KERNEL1x4_E 64,16, 0,1 MY_ALIGN ZGEMM_L1x4_SUB2_1: andi. T1,L, 1 @@ -681,7 +821,7 @@ ZGEMM_L1x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L1x2_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -694,26 +834,26 @@ ZGEMM_L1x2_LOOP_START: MY_ALIGN ZGEMM_L1x2_LOOP: - KERNEL1x2_L 32,32,0,0 - KERNEL1x2_L 32,32,1,0 - KERNEL1x2_L 32,32,2,0 - KERNEL1x2_L 32,32,3,0 - KERNEL1x2_L 32,32,4,0 - KERNEL1x2_L 32,32,5,0 - KERNEL1x2_L 32,32,6,0 - KERNEL1x2_L 32,32,7,0 - KERNEL1x2_L 32,32,8,0 - KERNEL1x2_L 32,32,9,0 - KERNEL1x2_L 32,32,10,0 - KERNEL1x2_L 32,32,11,0 - KERNEL1x2_L 32,32,12,0 - KERNEL1x2_L 32,32,13,0 - KERNEL1x2_L 32,32,14,0 - KERNEL1x2_L 32,32,15,1 + KERNEL1x2_L 32,16,0,0 + KERNEL1x2_L 32,16,1,0 + KERNEL1x2_L 32,16,2,0 + KERNEL1x2_L 32,16,3,0 + KERNEL1x2_L 32,16,4,0 + KERNEL1x2_L 32,16,5,0 + KERNEL1x2_L 32,16,6,0 + KERNEL1x2_L 32,16,7,0 + KERNEL1x2_L 32,16,8,0 + KERNEL1x2_L 32,16,9,0 + KERNEL1x2_L 32,16,10,0 + KERNEL1x2_L 32,16,11,0 + KERNEL1x2_L 32,16,12,0 + KERNEL1x2_L 32,16,13,0 + KERNEL1x2_L 32,16,14,0 + KERNEL1x2_L 32,16,15,1 bdnz ZGEMM_L1x2_LOOP MY_ALIGN ZGEMM_L1x2_LOOP_END: - END1x2 AO, BO, 32, 32 + END1x2 AO, BO, 32,16 b ZGEMM_L1x2_SUB1 @@ -735,24 +875,24 @@ ZGEMM_L1x2_SUB2: MY_ALIGN ZGEMM_L1x2_SUB2_LOOP: LOAD1x2 0 - KERNEL1x2_L 32,32, 0,0 - KERNEL1x2_L 32,32, 1,0 - KERNEL1x2_L 32,32, 2,0 - KERNEL1x2_E 32,32, 3,1 + KERNEL1x2_L 32,16, 0,0 + KERNEL1x2_L 32,16, 1,0 + KERNEL1x2_L 32,16, 2,0 + KERNEL1x2_E 32,16, 3,1 bdnz ZGEMM_L1x2_SUB2_LOOP MY_ALIGN ZGEMM_L1x2_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x2_SUB2_2 LOAD1x2 0 - KERNEL1x2_L 32,32, 0,0 - KERNEL1x2_E 32,32, 1,1 + KERNEL1x2_L 32,16, 0,0 + KERNEL1x2_E 32,16, 1,1 MY_ALIGN ZGEMM_L1x2_SUB2_2: andi. T1,L, 2 ble ZGEMM_L1x2_SUB2_1 LOAD1x2 0 - KERNEL1x2_E 32,32, 0,1 + KERNEL1x2_E 32,16, 0,1 MY_ALIGN ZGEMM_L1x2_SUB2_1: andi. T1,L, 1 @@ -769,7 +909,7 @@ ZGEMM_L1x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L1x1_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -783,26 +923,26 @@ ZGEMM_L1x1_LOOP_START: MY_ALIGN ZGEMM_L1x1_LOOP: - KERNEL1x1_L 16,32,0,0 - KERNEL1x1_L 16,32,1,0 - KERNEL1x1_L 16,32,2,0 - KERNEL1x1_L 16,32,3,0 - KERNEL1x1_L 16,32,4,0 - KERNEL1x1_L 16,32,5,0 - KERNEL1x1_L 16,32,6,0 - KERNEL1x1_L 16,32,7,0 - KERNEL1x1_L 16,32,8,0 - KERNEL1x1_L 16,32,9,0 - KERNEL1x1_L 16,32,10,0 - KERNEL1x1_L 16,32,11,0 - KERNEL1x1_L 16,32,12,0 - KERNEL1x1_L 16,32,13,0 - KERNEL1x1_L 16,32,14,0 - KERNEL1x1_L 16,32,15,1 + KERNEL1x1_L 16,16,0,0 + KERNEL1x1_L 16,16,1,0 + KERNEL1x1_L 16,16,2,0 + KERNEL1x1_L 16,16,3,0 + KERNEL1x1_L 16,16,4,0 + KERNEL1x1_L 16,16,5,0 + KERNEL1x1_L 16,16,6,0 + KERNEL1x1_L 16,16,7,0 + KERNEL1x1_L 16,16,8,0 + KERNEL1x1_L 16,16,9,0 + KERNEL1x1_L 16,16,10,0 + KERNEL1x1_L 16,16,11,0 + KERNEL1x1_L 16,16,12,0 + KERNEL1x1_L 16,16,13,0 + KERNEL1x1_L 16,16,14,0 + KERNEL1x1_L 16,16,15,1 bdnz ZGEMM_L1x1_LOOP MY_ALIGN ZGEMM_L1x1_LOOP_END: - END1x1 AO, BO, 16, 32 + END1x1 AO, BO, 16, 16 b ZGEMM_L1x1_SUB1 @@ -824,24 +964,24 @@ ZGEMM_L1x1_SUB2: MY_ALIGN ZGEMM_L1x1_SUB2_LOOP: LOAD1x1 0 - KERNEL1x1_L 16,32, 0,0 - KERNEL1x1_L 16,32, 1,0 - KERNEL1x1_L 16,32, 2,0 - KERNEL1x1_E 16,32, 3,1 + KERNEL1x1_L 16,16, 0,0 + KERNEL1x1_L 16,16, 1,0 + KERNEL1x1_L 16,16, 2,0 + KERNEL1x1_E 16,16, 3,1 bdnz ZGEMM_L1x1_SUB2_LOOP MY_ALIGN ZGEMM_L1x1_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x1_SUB2_2 LOAD1x1 0 - KERNEL1x1_L 16,32, 0,0 - KERNEL1x1_E 16,32, 1,1 + KERNEL1x1_L 16,16, 0,0 + KERNEL1x1_E 16,16, 1,1 MY_ALIGN ZGEMM_L1x1_SUB2_2: andi. 
T1,L, 2 ble ZGEMM_L1x1_SUB2_1 LOAD1x1 0 - KERNEL1x1_E 16,32, 0,1 + KERNEL1x1_E 16,16, 0,1 MY_ALIGN ZGEMM_L1x1_SUB2_1: andi. T1,L, 1 diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 93a309ad1..10d9e4cc3 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -25,68 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xssubdp - #define XSFADD_I1 xsadddp - #define XSFADD_I2 xsadddp - -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xsadddp - #define XSFADD_I1 xssubdp - #define XSFADD_I2 xsadddp - -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xsadddp - #define XSFADD_I1 xsadddp - #define XSFADD_I2 xssubdp - -#else // CC || CR || RC || RR - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xssubdp - #define XSFADD_I1 xssubdp - #define XSFADD_I2 xssubdp - -#endif - -.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V - AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7 -.endm - -.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8 - xxlxor \TEMP1, \TEMP1, \TEMP1 - xxlxor \TEMP2, \TEMP2, \TEMP2 - - xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB - - XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB - XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB - - xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB - xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB - - XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB - XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB - - xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i - xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r - xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r - xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i - - xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i - xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r - xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part -.endm - -/********************************************************************************************** -* Macros for N=2 and M=8 -**********************************************************************************************/ #define unit_size 16 #define DISP32(ind,disp) (ind*unit_size*32+disp) @@ -95,338 +33,457 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
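The helper block that follows replaces the old scalar XSFADD/AGGREGATE_INTO_COMPLEX epilogue with a fully vectorized one. During the loop, each accumulator pair holds {ar*br, ai*bi} (from b unswapped) and {ar*bi, ai*br} (from the xxswapd copy); at save time the xxmrgld/xxmrghd merges regroup those lanes across two results and the signed combines produce real and imaginary parts two at a time. A scalar C sketch of the NN-case arithmetic (combine_nn is an invented name, and the real code additionally folds in the previously loaded C values):

    typedef struct { double re, im; } zdouble;

    /* rr = ar*br, ii = ai*bi, ri = ar*bi, ir = ai*br for one element */
    zdouble combine_nn(double rr, double ii, double ri, double ir,
                       zdouble alpha)
    {
        double t_re = rr - ii;          /* AGGREGATE_REALS_IMAGES, NN case */
        double t_im = ri + ir;
        zdouble out;
        out.re = t_re * alpha.re - t_im * alpha.im;  /* MULT_APLHA_PART1/2 */
        out.im = t_re * alpha.im + t_im * alpha.re;
        return out;
    }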
 #define DISP4(ind,disp) (ind*unit_size*4+disp)
 #define DISP2(ind,disp) (ind*unit_size*2+disp)
 #define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+/* HELPERS FOR SAVE */
+
+/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+
+/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results */
+.endm
+
+/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
+.endm
+
+/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* we assume a negated alpha {-alpha_r,-alpha_i} for this case */
+ /* the real lane holds i1*i2 - r1*r2, so the negated alpha_r fixes its sign */
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* the imag lane is likewise negated, so the negated alpha_i fixes its sign */
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ; VSOUT2 = VSOUT2 + {r0,r1} * {alpha_i,alpha_i} */
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+
+/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
+
+/* unpack to store 2 {r,r} {i,i} pairs into {r,i} {r,i} (big endian because of stxv) */
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET+64)
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
+ RESULT_INTO_REALREAL_IMAGEIMAGE
\VSRes9,\VSRes11,vs10,vs11 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs6,vs8,vs16,vs17 + MULT_APLHA_PART2 vs2,vs4,vs14,vs15 + AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + MULT_APLHA_PART1 vs10,vs12, vs24,vs25 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + MULT_APLHA_PART2 vs10,vs12,vs24,vs25 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 + UNPACK_FOR_STORE vs24,vs25,vs10,vs12 + UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART1 vs6,vs8, vs16,vs17 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 +#ifndef TRMMKERNEL + lxv vs18, (\LOFFSET)(\BASE_REG) + xxmrgld vs14,vs18,vs18 + xxmrghd vs15,vs18,vs18 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + xxmrghd vs7,vs15,vs14 + stxv vs7, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ .macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - 
xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 .endm .macro LOAD2x8 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A .if \Zero==1 - Zero2x8 + Zero2x8 .endif .endm .macro END2x8_NORMAL - END2x8 AO,BO,128,64 + END2x8 AO,BO,128,32 .endm -.macro END2x8 AREG, BREG, OffsetA, OffsetB +.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 - 
xvmaddadp vs48, vs0, vs18 // real*real, imag*real - xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs50, vs1, vs18 // real*real, imag*real - xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs52, vs2, vs18 // real*real, imag*real - xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs54, vs3, vs18 // real*real, imag*real - xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - xvmaddadp vs56, vs4, vs18 // real*real, imag*real - xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag - xvmaddadp vs58, vs5, vs18 // real*real, imag*real - xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - xvmaddadp vs60, vs6, vs18 // real*real, imag*real - xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag - xvmaddadp vs62, vs7, vs18 // real*real, imag*real - xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + + + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 .endm -.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + xxswapd vs21, vs20 + xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, 
imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 - xvmaddadp vs48, vs0, vs18 // real*real, imag*real - xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs50, vs1, vs18 // real*real, imag*real - xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs52, vs2, vs18 // real*real, imag*real - xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs54, vs3, vs18 // real*real, imag*real - xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - xvmaddadp vs56, vs4, vs18 // real*real, imag*real - xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag - xvmaddadp vs58, vs5, vs18 // real*real, imag*real - xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - xvmaddadp vs60, vs6, vs18 // real*real, imag*real - xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag - xvmaddadp vs62, vs7, vs18 // real*real, imag*real - xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif + + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 .if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP16(\Index,128+ 
+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP8(\Index,128) + + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + +.if \Complete==0 + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A .endif -.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs12, vs20 // real*real, imag*real - xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag - xvmaddadp vs42, vs13, vs20 // real*real, imag*real - xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - xvmaddadp vs44, vs14, vs20 // real*real, imag*real - xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag - xvmaddadp vs46, vs15, vs20 // real*real, imag*real - xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 - xvmaddadp vs48, vs8, vs22 // real*real, imag*real - xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs50, vs9, vs22 // real*real, imag*real - xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs52, vs10, vs22 // real*real, imag*real - xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs54, vs11, vs22 // real*real, imag*real - xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag - xvmaddadp vs56, vs12, vs22 // real*real, imag*real - xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag - xvmaddadp vs58, vs13, vs22 // real*real, imag*real - xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag - xvmaddadp vs60, vs14, vs22 // real*real, imag*real - xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag - xvmaddadp vs62, vs15, vs22 // real*real, imag*real - xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + +.if \Complete==0 + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 + +.if \Complete==0 + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif + +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 + +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd 
vs19, vs18 +.endif + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 + + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 + + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, vs23 + + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 .endm -.macro KERNEL2x8 +.macro KERNEL2x8 LOAD2x8 0 - END2x8 AO, BO, 128,64 + END2x8 AO, BO, 128,32 .endm .macro SAVE2x8 - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs48,vs49,vs8 - AGGREGATE_INTO_COMPLEX vs50,vs51,vs9 - AGGREGATE_INTO_COMPLEX vs52,vs53,vs10 - AGGREGATE_INTO_COMPLEX vs54,vs55,vs11 - AGGREGATE_INTO_COMPLEX vs56,vs57,vs12 - AGGREGATE_INTO_COMPLEX vs58,vs59,vs13 - AGGREGATE_INTO_COMPLEX vs60,vs61,vs14 - AGGREGATE_INTO_COMPLEX vs62,vs63,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - addi CO, CO, 128 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 .endm @@ -435,223 +492,178 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, 
vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 .endm .macro LOAD2x4 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - .if \Zero==1 - Zero2x4 + Zero2x4 .endif .endm .macro END2x4_NORMAL - END2x4 AO,BO,64,64 + END2x4 AO,BO,64,32 .endm -.macro END2x4 AREG, BREG, OffsetA, OffsetB +.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 .endm -.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - 
lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) +.endif +.endif -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +.if \Complete==0 + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, 
DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP8(\Index,128) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs8, vs22 // real*real, imag*real - xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs42, vs9, vs22 // real*real, imag*real - xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs44, vs10, vs22 // real*real, imag*real - xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs46, vs11, vs22 // real*real, imag*real - xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs41, vs8, vs23 + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs43, vs9, vs23 + + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs45, vs10, vs23 + + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs47, vs11, vs23 .endm -.macro KERNEL2x4 +.macro KERNEL2x4 LOAD2x4 0 - END2x4 AO, BO, 64,64 + END2x4 AO, BO, 64,32 .endm -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs40,vs41,vs8 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs9 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs10 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - addi CO, CO, 64 +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 .endm @@ -660,170 +672,131 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, 
vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 .endm .macro LOAD2x2 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - .if \Zero==1 - Zero2x2 -.endif - + Zero2x2 +.endif .endm .macro END2x2_NORMAL - END2x2 AO,BO,32,64 + END2x2 AO,BO,32,32 .endm -.macro END2x2 AREG, BREG, OffsetA, OffsetB +.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs37, vs0, vs19 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 - xvmaddadp vs36, vs0, vs18 // real*real, imag*real - xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs38, vs1, vs18 // real*real, imag*real - xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag - .endm -.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xxswapd vs23, vs22 -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs37, vs0, vs19 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - - xvmaddadp vs36, vs0, vs18 // real*real, imag*real - 
xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs38, vs1, vs18 // real*real, imag*real - xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 .if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP8(\Index,128) -.endif -.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs37, vs8, vs23 + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs39, vs9, vs23 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - - xvmaddadp vs36, vs8, vs22 // real*real, imag*real - xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs38, vs9, vs22 // real*real, imag*real - xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag - .endm -.macro KERNEL2x2 +.macro KERNEL2x2 LOAD2x2 0 - END2x2 AO, BO, 32,64 + END2x2 AO, BO, 32,32 .endm -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs36,vs37,vs8 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - - addi CO, CO, 32 - +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 .endm /********************************************************************************************** @@ -831,348 +804,288 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 .endm .macro LOAD2x1 Zero - lxv vs0, 0(AO) // load real,imag from A + lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 
0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 .if \Zero==1 - Zero2x1 -.endif - + Zero2x1 +.endif .endm .macro END2x1_NORMAL - END2x1 AO,BO,16,64 + END2x1 AO,BO,16,32 .endm -.macro END2x1 AREG, BREG, OffsetA, OffsetB +.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 - xvmaddadp vs34, vs0, vs18 // real*real, imag*real - xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag - .endm -.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xxswapd vs21, vs20 + xxswapd vs23, vs22 +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif - xvmaddadp vs34, vs0, vs18 // real*real, imag*real - xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 .if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP8(\Index,128) +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if 
\IsLast==1 + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif .endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs8, vs22 // real*real, imag*real - xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag - +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs35, vs8, vs23 + .endm -.macro KERNEL2x1 +.macro KERNEL2x1 LOAD2x1 0 - END2x1 AO, BO, 16,64 + END2x1 AO, BO, 16,32 .endm .macro SAVE2x1 - - mr T1, CO -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - - AGGREGATE_INTO_COMPLEX vs34,vs35,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - - addi CO, CO, 16 - + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 .endm .macro LOAD1x8 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A .if \Zero==1 - Zero1x8 + Zero1x8 .endif .endm .macro END1x8_NORMAL - END1x8 AO,BO,128,32 + END1x8 AO,BO,128,16 .endm -.macro END1x8 AREG, BREG, OffsetA, OffsetB +.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag 
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 .endm -.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21, vs20 - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load 
real,imag from A + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 .if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs12, vs20 // 
real*real, imag*real - xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag - xvmaddadp vs42, vs13, vs20 // real*real, imag*real - xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - xvmaddadp vs44, vs14, vs20 // real*real, imag*real - xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag - xvmaddadp vs46, vs15, vs20 // real*real, imag*real - xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 .endm -.macro KERNEL1x8 +.macro KERNEL1x8 LOAD1x8 0 - END1x8 AO, BO, 128,32 + END1x8 AO, BO, 128,16 .endm .macro SAVE1x8 - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - addi CO, CO, 128 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 .endm @@ -1181,170 +1094,143 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 .endm .macro LOAD1x4 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17,vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - .if \Zero==1 - Zero1x4 + Zero1x4 .endif .endm .macro END1x4_NORMAL - END1x4 AO,BO,64,32 + END1x4 AO,BO,64,16 .endm -.macro END1x4 AREG, BREG, OffsetA, OffsetB +.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - 
xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 .endm -.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21,vs20 -lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 .if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 
32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +.if \Complete==0 + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs8, vs22 // real*real, imag*real - xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs42, vs9, vs22 // real*real, imag*real - xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs44, vs10, vs22 // real*real, imag*real - xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs46, vs11, vs22 // real*real, imag*real - xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs41, vs8, vs23 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs43, vs9, vs23 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs47, vs11, vs23 .endm -.macro KERNEL1x4 +.macro KERNEL1x4 LOAD1x4 0 - END1x4 AO, BO, 64,32 + END1x4 AO, BO, 64,16 .endm .macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - addi CO, CO, 64 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, CO, 64 .endm @@ -1353,122 +1239,99 @@ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 .endm .macro LOAD1x2 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part 
from B - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17,vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A .if \Zero==1 - Zero1x2 + Zero1x2 .endif .endm .macro END1x2_NORMAL - END1x2 AO,BO,32,32 + END1x2 AO,BO,32,16 .endm -.macro END1x2 AREG, BREG, OffsetA, OffsetB +.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + .endm -.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21,vs20 -lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 .if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif .endif -.endif - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // 
real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 .endm -.macro KERNEL1x2 +.macro KERNEL1x2 LOAD1x2 0 - END1x2 AO, BO, 32,32 + END1x2 AO, BO, 32,16 .endm .macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - -addi CO, CO, 32 - + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 .endm /********************************************************************************************** @@ -1476,189 +1339,89 @@ addi CO, CO, 32 **********************************************************************************************/ .macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 .endm .macro LOAD1x1 Zero - lxv vs0, 0(AO) // load real,imag from A - - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs0, 0(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17, vs16 .if \Zero==1 - Zero1x1 + Zero1x1 .endif - + .endm .macro END1x1_NORMAL - END1x1 AO,BO,16,32 + END1x1 AO,BO,16,16 .endm -.macro END1x1 AREG, BREG, OffsetA, OffsetB +.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + +.endm + +.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21, vs20 + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - -.endm - -.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 .if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17, vs16 .endif - -.if \IsLast==1 +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, 
DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) .endif .endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - - + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + + .endm -.macro KERNEL1x1 +.macro KERNEL1x1 LOAD1x1 0 - END1x1 AO, BO, 16,32 - -.endm - -.macro SAVE1x1 - - mr T1, CO -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - -addi CO, CO, 16 + END1x1 AO, BO, 16,16 .endm - -.macro ZCOPYB_2 - - lxv vs32, 0(BO) - lxv vs33, 16(BO) - addi BO, BO, 32 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - xxspltd vs42, vs33, 1 - xxspltd vs43, vs33, 0 - - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - stxv vs42, 32(BBO) - stxv vs43, 48(BBO) - addi BBO, BBO, 64 - -.endm - -.macro ZCOPYB_1 - - lxv vs32, 0(BO) - addi BO, BO, 16 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - - addi BBO, BBO, 32 - -.endm - -.macro ZCOPYB_8 - - lxv vs32, 0(BO) - lxv vs33, 16(BO) - lxv vs34, 32(BO) - lxv vs35, 48(BO) - - lxv vs36, 64+0(BO) - lxv vs37, 64+16(BO) - lxv vs38, 64+32(BO) - lxv vs39, 64+48(BO) - addi BO, BO, 128 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - xxspltd vs42, vs33, 1 - xxspltd vs43, vs33, 0 - xxspltd vs44, vs34, 1 - xxspltd vs45, vs34, 0 - xxspltd vs46, vs35, 1 - xxspltd vs47, vs35, 0 - - xxspltd vs48, vs36, 1 - xxspltd vs49, vs36, 0 - xxspltd vs50, vs37, 1 - xxspltd vs51, vs37, 0 - xxspltd vs52, vs38, 1 - xxspltd vs53, vs38, 0 - xxspltd vs54, vs39, 1 - xxspltd vs55, vs39, 0 - - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - stxv vs42, 32(BBO) - stxv vs43, 48(BBO) - - stxv vs44, 64+0(BBO) - stxv vs45, 64+16(BBO) - stxv vs46, 64+32(BBO) - stxv vs47, 64+48(BBO) - - stxv vs48, 128+ 0(BBO) - stxv vs49, 128+ 16(BBO) - stxv vs50, 128+ 32(BBO) - stxv vs51, 128+ 48(BBO) - - stxv vs52, 192 + 0(BBO) - stxv vs53, 192 + 16(BBO) - stxv vs54, 192+ 32(BBO) - stxv vs55, 192 + 48(BBO) - addi BBO, BBO, 256 - +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 .endm diff --git a/param.h b/param.h index d0b8518c9..8f78a6a64 100644 --- a/param.h +++ b/param.h @@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 640 +#define SGEMM_DEFAULT_P 832 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 640 -#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1408 +#define SGEMM_DEFAULT_Q 1025 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 1152 +#define ZGEMM_DEFAULT_Q 1025 #define SYMV_P 8 From 7a9a4dbc4fdd748747cd86ae685e760ae8cdc10f Mon Sep 17 00:00:00 2001 From: Michael Lass Date: Fri, 3 May 2019 21:07:14 +0200 Subject: [PATCH 073/127] Fix detection of AVX512 capable compilers in getarch 21eda8b5 introduced a check in getarch.c to test if the compiler is capable of AVX512. This check currently fails, since the used __AVX2__ macro is only defined if getarch itself was compiled with AVX2/AVX512 support. Make sure this is the case by building getarch with -march=native on x86_64. It is only supposed to run on the build host anyway. 
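An illustrative sketch of the failure mode (hypothetical snippet, not the
literal getarch.c code): predefined macros such as __AVX2__ describe the
flags of the *current* compilation, not what the compiler is capable of,
so a capability probe guarded by __AVX2__ is silently skipped unless
getarch itself was built with AVX2 enabled.

    /* sketch only -- the real check in getarch.c may differ in detail */
    #if defined(__AVX2__)
    /* Only reached when getarch was compiled with AVX2 enabled
       (e.g. -march=native on an AVX2-capable build host); the
       AVX512 compiler-support probe would run here. */
    int compiler_may_support_avx512 = 1;
    #else
    int compiler_may_support_avx512 = 0;  /* probe never runs */
    #endif

Building getarch with -march=native makes the guard reflect the build
host, which is the machine getarch is meant to describe anyway.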
--- Makefile.system | 9 +++++++++ cmake/system.cmake | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/Makefile.system b/Makefile.system index f574edf88..eb57cbb30 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,11 @@ ifndef TOPDIR TOPDIR = . endif +# If ARCH is not set, we use the host system's architecture. +ifndef ARCH +ARCH := $(shell uname -m) +endif + # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 @@ -137,6 +142,10 @@ endif endif +# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +ifeq ($(ARCH), x86_64) +GETARCH_FLAGS += -march=native +endif ifdef INTERFACE64 diff --git a/cmake/system.cmake b/cmake/system.cmake index adedd32cc..7f3696286 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -65,6 +65,11 @@ if (DEFINED TARGET) set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () +# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +if (X86_64) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") +endif () + if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") From 9cdc828afa3b209c2c74a7d9daa5fac85bece49f Mon Sep 17 00:00:00 2001 From: Michael Lass Date: Fri, 3 May 2019 21:22:27 +0200 Subject: [PATCH 074/127] c_check: Unlink correct file --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index d93b756d5..271182c54 100644 --- a/c_check +++ b/c_check @@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } else { $no_avx512 = 0; } - unlink("tmpf.o"); + unlink("$tmpf.o"); } } From d0c3543c3f38bbbdf363caa8c37bcf6df5bdb6fd Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Wed, 5 Jun 2019 10:30:57 +0000 Subject: [PATCH 075/127] power9 zgemm ztrmm optimized --- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/zgemm_kernel_power9.S | 2 +- kernel/power/zgemm_logic_power9.S | 2352 +++++++++++++++++++--------- kernel/power/zgemm_macros_power9.S | 1692 ++++++++++++-------- param.h | 2 +- 5 files changed, 2671 insertions(+), 1379 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 440eaab1b..126313c9a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -6,7 +6,7 @@ STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index a41bcec77..813f270b8 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T8 r16 #define T5 r17 #define T2 r19 -#define T9 r20 +#define TEMP_REG r20 #define T6 r21 #define I r22 #define J r23 diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index 01685fe79..f902484a3 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -26,972 +26,1866 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define MY_ALIGN .align 3 b ZGEMM_L2 - -/* MINI SUBROUTINES */ +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ - -/* 2x8 MAIN 128x+1 LOOP */ -ZGEMM_L2x8_LMAIN_SUB: - mtctr L - LOAD2x8 0 - MY_ALIGN +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN ZGEMM_L2x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,0 - KERNEL2x8_L 128,32,16,0 - KERNEL2x8_L 128,32,17,0 - KERNEL2x8_L 128,32,18,0 - KERNEL2x8_L 128,32,19,0 - KERNEL2x8_L 128,32,20,0 - KERNEL2x8_L 128,32,21,0 - KERNEL2x8_L 128,32,22,0 - KERNEL2x8_L 128,32,23,0 - KERNEL2x8_L 128,32,24,0 - KERNEL2x8_L 128,32,25,0 - KERNEL2x8_L 128,32,26,0 - KERNEL2x8_L 128,32,27,0 - KERNEL2x8_L 128,32,28,0 - KERNEL2x8_L 128,32,29,0 - KERNEL2x8_L 128,32,30,0 - KERNEL2x8_L 128,32,31,0 - KERNEL2x8_L 128,32,32,0 - KERNEL2x8_L 128,32,33,0 - KERNEL2x8_L 128,32,34,0 - KERNEL2x8_L 128,32,35,0 - KERNEL2x8_L 128,32,36,0 - KERNEL2x8_L 128,32,37,0 - KERNEL2x8_L 128,32,38,0 - KERNEL2x8_L 128,32,39,0 - KERNEL2x8_L 128,32,40,0 - KERNEL2x8_L 128,32,41,0 - KERNEL2x8_L 128,32,42,0 - KERNEL2x8_L 128,32,43,0 - KERNEL2x8_L 128,32,44,0 - KERNEL2x8_L 128,32,45,0 - KERNEL2x8_L 128,32,46,0 - KERNEL2x8_L 128,32,47,0 - KERNEL2x8_L 128,32,48,0 - KERNEL2x8_L 128,32,49,0 - KERNEL2x8_L 128,32,50,0 - KERNEL2x8_L 128,32,51,0 - KERNEL2x8_L 128,32,52,0 - KERNEL2x8_L 128,32,53,0 - KERNEL2x8_L 128,32,54,0 - KERNEL2x8_L 128,32,55,0 - KERNEL2x8_L 128,32,56,0 - KERNEL2x8_L 128,32,57,0 - KERNEL2x8_L 128,32,58,0 - KERNEL2x8_L 128,32,59,0 - KERNEL2x8_L 128,32,60,0 - KERNEL2x8_L 128,32,61,0 - KERNEL2x8_L 128,32,62,0 - KERNEL2x8_L 128,32,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + 
KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN ZGEMM_L2x8_LOOP_END: - END2x8 AO, BO, 128,32 - blr - +/*----------------------------------------*/ + END2x8_2 + blr MY_ALIGN + + ZGEMM_2x8_L64_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,0 - KERNEL2x8_L 128,32,16,0 - KERNEL2x8_L 128,32,17,0 - KERNEL2x8_L 128,32,18,0 - KERNEL2x8_L 128,32,19,0 - KERNEL2x8_L 128,32,20,0 - KERNEL2x8_L 128,32,21,0 - KERNEL2x8_L 128,32,22,0 - KERNEL2x8_L 128,32,23,0 - KERNEL2x8_L 128,32,24,0 - KERNEL2x8_L 128,32,25,0 - KERNEL2x8_L 128,32,26,0 - KERNEL2x8_L 128,32,27,0 - KERNEL2x8_L 128,32,28,0 - KERNEL2x8_L 128,32,29,0 - KERNEL2x8_L 128,32,30,0 - KERNEL2x8_E 128,32,31,1 - blr - - +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr MY_ALIGN + + ZGEMM_2x8_L32_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,1 - blr +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, 
T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr MY_ALIGN + ZGEMM_2x8_L16_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,1 - blr - MY_ALIGN +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + ZGEMM_2x4_LMAIN_SUB: - mtctr L - LOAD2x4 0 - MY_ALIGN -ZGEMM_L2x4_LOOP: - KERNEL2x4_L 64,32,0,0 - KERNEL2x4_L 64,32,1,0 - KERNEL2x4_L 64,32,2,0 - KERNEL2x4_L 64,32,3,0 - KERNEL2x4_L 64,32,4,0 - KERNEL2x4_L 64,32,5,0 - KERNEL2x4_L 64,32,6,0 - KERNEL2x4_L 64,32,7,0 - KERNEL2x4_L 64,32,8,0 - KERNEL2x4_L 64,32,9,0 - KERNEL2x4_L 64,32,10,0 - KERNEL2x4_L 64,32,11,0 - KERNEL2x4_L 64,32,12,0 - KERNEL2x4_L 64,32,13,0 - KERNEL2x4_L 64,32,14,0 - KERNEL2x4_L 64,32,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN ZGEMM_L2x4_LOOP_END: - END2x4 AO, BO, 64,32 - blr - +/*----------------------------------------*/ + END2x4_2 + blr MY_ALIGN + + ZGEMM_2x4_L16_SUB: - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_L 64,32, 1,0 - KERNEL2x4_L 64,32, 2,0 - KERNEL2x4_L 64,32, 3,0 - KERNEL2x4_L 64,32, 4,0 - KERNEL2x4_L 64,32, 5,0 - KERNEL2x4_L 64,32, 6,0 - KERNEL2x4_E 64,32, 7,1 +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 
+ KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 blr MY_ALIGN -ZGEMM_2x4_L8_SUB: - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_L 64,32, 1,0 - KERNEL2x4_L 64,32, 2,0 - KERNEL2x4_E 64,32, 3,1 +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 blr -/* MAIN LOOP BEGINS */ - MY_ALIGN + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + ZGEMM_L2: - srawi. J, N, 1 - ble ZGEMM_L2_END +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + ZGEMM_L2_BEGIN: - mr CO, C - slwi T1, LDC , 1 +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 add T2,C,LDC - mr AO, A - add C, C, T1 - srawi. I, M, 3 - ble ZGEMM_L2x8_END + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END dcbt CO,r0 /*just prefetch*/ dcbt T2,r0 -ZGEMM_L2x8_BEGIN: - mr T1, K - mr BO, B - dcbt B, r0 - dcbt AO, r0 - /* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -1 - /* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. L, T1, 7 /**(K-1) % 128x */ - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - -ZGEMM_L2x8_SUB0: - andi. 
L, K, 255 +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else cmpwi K,128 - bne ZGEMM_L2x8_SUB2 - MY_ALIGN -ZGEMM_L2x8_SUB2_128: - bl ZGEMM_2x8_L64_SUB - bl ZGEMM_2x8_L64_SUB - b ZGEMM_L2x8_SAVE +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE MY_ALIGN + + ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB MY_ALIGN + + ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB MY_ALIGN + + ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ andi. T1,L, 16 ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ andi. T1,L, 8 ble ZGEMM_L2x8_SUB2_4 - LOAD2x8 0 - KERNEL2x8_L 128,32, 0,0 - KERNEL2x8_L 128,32, 1,0 - KERNEL2x8_L 128,32, 2,0 - KERNEL2x8_E 128,32, 3,1 - MY_ALIGN + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x8_SUB2_2 - LOAD2x8 0 - KERNEL2x8_L 128,32, 0,0 - KERNEL2x8_E 128,32, 1,1 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 MY_ALIGN + + ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x8_SUB2_1 - LOAD2x8 0 - KERNEL2x8_E 128,32, 0,1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 MY_ALIGN + + ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + ZGEMM_L2x8_SAVE: - addic. I, I, -1 - SAVE2x8 +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - - andi. 
T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN ZGEMM_L2x8_END: +/*----------------------------------------*/ + ZGEMM_L2x4_BEGIN: - - andi. T2, M, 7 - ble ZGEMM_L2x1_END - - andi. T1, M, 4 - ble ZGEMM_L2x4_END - mr BO, B - mr T1, K - addi T1,T1, -1 - ZERO2x4 - srawi. L, T1, 5 /**(K-1) % 32x */ - - ble ZGEMM_L2x4_SUB0 +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + ZGEMM_L2x4_SUB0: - andi. L, K, 63 +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else cmpwi K,32 - bne ZGEMM_L2x4_SUB2 - MY_ALIGN -ZGEMM_L2x4_SUB2_32: - bl ZGEMM_2x4_L16_SUB - bl ZGEMM_2x4_L16_SUB - b ZGEMM_L2x4_SAVE +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE MY_ALIGN -ZGEMM_L2x4_SUB2: + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ andi. T1,L, 16 ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB + bl ZGEMM_2x4_L16_SUB MY_ALIGN -ZGEMM_L2x4_SUB2_8: + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ andi. T1,L, 8 ble ZGEMM_L2x4_SUB2_4 bl ZGEMM_2x4_L8_SUB MY_ALIGN + + ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x4_SUB2_2 - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_E 64,32, 1,1 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 MY_ALIGN + + ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x4_SUB2_1 - LOAD2x4 0 - KERNEL2x4_E 64,32, 0,1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 MY_ALIGN + + ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif - SAVE2x4 ZGEMM_L2x4_END: +/*----------------------------------------*/ -ZGEMM_L2x2_BEGIN: - andi. T1, M, 2 - ble ZGEMM_L2x2_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 4 /**(K-1) % 16x */ - ZERO2x2 - ble ZGEMM_L2x2_SUB0 +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. 
L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 -ZGEMM_L2x2_LOOP_START: - LOAD2x2 0 - mtctr L - MY_ALIGN -ZGEMM_L2x2_LOOP: - KERNEL2x2_L 32,32,0,0 - KERNEL2x2_L 32,32,1,0 - KERNEL2x2_L 32,32,2,0 - KERNEL2x2_L 32,32,3,0 - KERNEL2x2_L 32,32,4,0 - KERNEL2x2_L 32,32,5,0 - KERNEL2x2_L 32,32,6,0 - KERNEL2x2_L 32,32,7,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN -ZGEMM_L2x2_LOOP_END: - END2x2 AO, BO, 32,32 - - b ZGEMM_L2x2_SUB1 - ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 31 - - b ZGEMM_L2x2_SUB2 - -ZGEMM_L2x2_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x2_SAVE ZGEMM_L2x2_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x2_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB MY_ALIGN -ZGEMM_L2x2_SUB2_LOOP: - LOAD2x2 0 - KERNEL2x2_L 32,32, 0,0 - KERNEL2x2_L 32,32, 1,0 - KERNEL2x2_L 32,32, 2,0 - KERNEL2x2_E 32,32, 3,1 - bdnz ZGEMM_L2x2_SUB2_LOOP + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB MY_ALIGN + + ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x2_SUB2_2 - LOAD2x2 0 - KERNEL2x2_L 32,32, 0,0 - KERNEL2x2_E 32,32, 1,1 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 MY_ALIGN + + ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x2_SUB2_1 - LOAD2x2 0 - KERNEL2x2_E 32,32, 0,1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 MY_ALIGN -ZGEMM_L2x2_SUB2_1: - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 -ZGEMM_L2x2_SAVE: - SAVE2x2 + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + ZGEMM_L2x2_END: +/*----------------------------------------*/ +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 -ZGEMM_L2x1_BEGIN: - andi. T1, M, 1 - ble ZGEMM_L2x1_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 4 /**(K-1) % 16x */ - ZERO2x1 - ble ZGEMM_L2x1_SUB0 -ZGEMM_L2x1_LOOP_START: - - LOAD2x1 0 - mtctr L - - MY_ALIGN -ZGEMM_L2x1_LOOP: - KERNEL2x1_L 16,32,0,0 - KERNEL2x1_L 16,32,1,0 - KERNEL2x1_L 16,32,2,0 - KERNEL2x1_L 16,32,3,0 - KERNEL2x1_L 16,32,4,0 - KERNEL2x1_L 16,32,5,0 - KERNEL2x1_L 16,32,6,0 - KERNEL2x1_L 16,32,7,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: - END2x1 AO, BO, 16,32 - - b ZGEMM_L2x1_SUB1 - ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 31 - - b ZGEMM_L2x1_SUB2 - -ZGEMM_L2x1_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x1_SAVE ZGEMM_L2x1_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x1_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB MY_ALIGN -ZGEMM_L2x1_SUB2_LOOP: - LOAD2x1 0 - KERNEL2x1_L 16,32, 0,0 - KERNEL2x1_L 16,32, 1,0 - KERNEL2x1_L 16,32, 2,0 - KERNEL2x1_E 16,32, 3,1 - bdnz ZGEMM_L2x1_SUB2_LOOP + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB MY_ALIGN + + ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x1_SUB2_2 - LOAD2x1 0 - KERNEL2x1_L 16,32, 0,0 - KERNEL2x1_E 16,32, 1,1 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 MY_ALIGN + + ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x1_SUB2_1 - LOAD2x1 0 - KERNEL2x1_E 16,32, 0,1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 MY_ALIGN + + ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif - SAVE2x1 ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN - slwi T1, K, 5 - add B, B, T1 - - addic. J, J, -1 - bgt ZGEMM_L2_BEGIN - - andi. 
T2, N, 1 - ble L999 ZGEMM_L2_END: - b ZGEMM_L1_BEGIN +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ -L999_H1: - b L999 +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 
256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + 
mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + ZGEMM_L1_BEGIN: - andi. T1, N, 1 - ble ZGEMM_L1_END +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 - mr CO, C - mr AO, A - srawi. I, M, 3 - ble ZGEMM_L1x8_END ZGEMM_L1x8_BEGIN: - - - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 32x */ - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - - -ZGEMM_L1x8_LOOP_START: - - LOAD1x8 0 +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ li T2, 1024 - li T3, 1024+512 - li T4, 2048 - li T5, 2048+512 - mtctr L + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + - MY_ALIGN -ZGEMM_L1x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L 128,16,0,0 - KERNEL1x8_L 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L 128,16,2,0 - KERNEL1x8_L 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L 128,16,4,0 - KERNEL1x8_L 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L 128,16,6,0 - KERNEL1x8_L 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L 128,16,8,0 - KERNEL1x8_L 128,16,9,0 - KERNEL1x8_L 128,16,10,0 - KERNEL1x8_L 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L 128,16,12,0 - KERNEL1x8_L 128,16,13,0 - KERNEL1x8_L 128,16,14,0 - KERNEL1x8_L 128,16,15,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: - END1x8 AO, BO, 128,16 - - b ZGEMM_L1x8_SUB1 - ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x8_SUB2 - -ZGEMM_L1x8_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x8_SAVE ZGEMM_L1x8_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x8_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB MY_ALIGN -ZGEMM_L1x8_SUB2_LOOP: - LOAD1x8 0 - KERNEL1x8_L 128,16, 0,0 - KERNEL1x8_L 128,16, 1,0 - KERNEL1x8_L 128,16, 2,0 - KERNEL1x8_E 128,16, 3,1 - bdnz ZGEMM_L1x8_SUB2_LOOP - MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x8_SUB2_2 - LOAD1x8 0 - KERNEL1x8_L 128,16, 0,0 - KERNEL1x8_E 128,16, 1,1 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 MY_ALIGN + + ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x8_SUB2_1 - LOAD1x8 0 - KERNEL1x8_E 128,16, 0,1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 MY_ALIGN + + ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN - SAVE1x8 - - addic. I, I, -1 - bgt ZGEMM_L1x8_BEGIN ZGEMM_L1x8_END: +/*----------------------------------------*/ + ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - -ZGEMM_L1x4_LOOP_START: - LOAD1x4 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x4_LOOP: - KERNEL1x4_L 64,16,0,0 - KERNEL1x4_L 64,16,1,0 - KERNEL1x4_L 64,16,2,0 - KERNEL1x4_L 64,16,3,0 - KERNEL1x4_L 64,16,4,0 - KERNEL1x4_L 64,16,5,0 - KERNEL1x4_L 64,16,6,0 - KERNEL1x4_L 64,16,7,0 - KERNEL1x4_L 64,16,8,0 - KERNEL1x4_L 64,16,9,0 - KERNEL1x4_L 64,16,10,0 - KERNEL1x4_L 64,16,11,0 - KERNEL1x4_L 64,16,12,0 - KERNEL1x4_L 64,16,13,0 - KERNEL1x4_L 64,16,14,0 - KERNEL1x4_L 64,16,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN -ZGEMM_L1x4_LOOP_END: - END1x4 AO, BO, 64,16 - - b ZGEMM_L1x4_SUB1 - ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x4_SUB2 - -ZGEMM_L1x4_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE ZGEMM_L1x4_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x4_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB MY_ALIGN -ZGEMM_L1x4_SUB2_LOOP: - LOAD1x4 0 - KERNEL1x4_L 64,16, 0,0 - KERNEL1x4_L 64,16, 1,0 - KERNEL1x4_L 64,16, 2,0 - KERNEL1x4_E 64,16, 3,1 - bdnz ZGEMM_L1x4_SUB2_LOOP + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB MY_ALIGN + + ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x4_SUB2_2 - LOAD1x4 0 - KERNEL1x4_L 64,16, 0,0 - KERNEL1x4_E 64,16, 1,1 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 MY_ALIGN + + ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x4_SUB2_1 - LOAD1x4 0 - KERNEL1x4_E 64,16, 0,1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 MY_ALIGN + + ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif - SAVE1x4 ZGEMM_L1x4_END: +/*----------------------------------------*/ + ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 - andi. T1, M, 2 - ble ZGEMM_L1x2_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - -ZGEMM_L1x2_LOOP_START: - LOAD1x2 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x2_LOOP: - KERNEL1x2_L 32,16,0,0 - KERNEL1x2_L 32,16,1,0 - KERNEL1x2_L 32,16,2,0 - KERNEL1x2_L 32,16,3,0 - KERNEL1x2_L 32,16,4,0 - KERNEL1x2_L 32,16,5,0 - KERNEL1x2_L 32,16,6,0 - KERNEL1x2_L 32,16,7,0 - KERNEL1x2_L 32,16,8,0 - KERNEL1x2_L 32,16,9,0 - KERNEL1x2_L 32,16,10,0 - KERNEL1x2_L 32,16,11,0 - KERNEL1x2_L 32,16,12,0 - KERNEL1x2_L 32,16,13,0 - KERNEL1x2_L 32,16,14,0 - KERNEL1x2_L 32,16,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN -ZGEMM_L1x2_LOOP_END: - END1x2 AO, BO, 32,16 - - b ZGEMM_L1x2_SUB1 - ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x2_SUB2 - -ZGEMM_L1x2_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE ZGEMM_L1x2_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x2_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB MY_ALIGN -ZGEMM_L1x2_SUB2_LOOP: - LOAD1x2 0 - KERNEL1x2_L 32,16, 0,0 - KERNEL1x2_L 32,16, 1,0 - KERNEL1x2_L 32,16, 2,0 - KERNEL1x2_E 32,16, 3,1 - bdnz ZGEMM_L1x2_SUB2_LOOP + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB MY_ALIGN + + ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x2_SUB2_2 - LOAD1x2 0 - KERNEL1x2_L 32,16, 0,0 - KERNEL1x2_E 32,16, 1,1 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 MY_ALIGN + + ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x2_SUB2_1 - LOAD1x2 0 - KERNEL1x2_E 32,16, 0,1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 MY_ALIGN -ZGEMM_L1x2_SUB2_1: - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 -ZGEMM_L1x2_SAVE: - SAVE1x2 + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + ZGEMM_L1x2_END: +/*----------------------------------------*/ + ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 - andi. T1, M, 1 - ble ZGEMM_L1x1_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - -ZGEMM_L1x1_LOOP_START: - - LOAD1x1 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x1_LOOP: - KERNEL1x1_L 16,16,0,0 - KERNEL1x1_L 16,16,1,0 - KERNEL1x1_L 16,16,2,0 - KERNEL1x1_L 16,16,3,0 - KERNEL1x1_L 16,16,4,0 - KERNEL1x1_L 16,16,5,0 - KERNEL1x1_L 16,16,6,0 - KERNEL1x1_L 16,16,7,0 - KERNEL1x1_L 16,16,8,0 - KERNEL1x1_L 16,16,9,0 - KERNEL1x1_L 16,16,10,0 - KERNEL1x1_L 16,16,11,0 - KERNEL1x1_L 16,16,12,0 - KERNEL1x1_L 16,16,13,0 - KERNEL1x1_L 16,16,14,0 - KERNEL1x1_L 16,16,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN -ZGEMM_L1x1_LOOP_END: - END1x1 AO, BO, 16, 16 - - b ZGEMM_L1x1_SUB1 - ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x1_SUB2 - -ZGEMM_L1x1_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE ZGEMM_L1x1_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x1_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB MY_ALIGN -ZGEMM_L1x1_SUB2_LOOP: - LOAD1x1 0 - KERNEL1x1_L 16,16, 0,0 - KERNEL1x1_L 16,16, 1,0 - KERNEL1x1_L 16,16, 2,0 - KERNEL1x1_E 16,16, 3,1 - bdnz ZGEMM_L1x1_SUB2_LOOP + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB MY_ALIGN + + ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x1_SUB2_2 - LOAD1x1 0 - KERNEL1x1_L 16,16, 0,0 - KERNEL1x1_E 16,16, 1,1 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 MY_ALIGN + + ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x1_SUB2_1 - LOAD1x1 0 - KERNEL1x1_E 16,16, 0,1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 MY_ALIGN + + ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif - SAVE1x1 ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + ZGEMM_L1_END: +/*----------------------------------------*/ + \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 10d9e4cc3..8670e9574 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -25,7 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #define unit_size 16 #define DISP32(ind,disp) (ind*unit_size*32+disp) #define DISP16(ind,disp) (ind*unit_size*16+disp) @@ -34,10 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DISP2(ind,disp) (ind*unit_size*2+disp) #define DISP1(ind,disp) (ind*unit_size+disp) #define DISPX(disp) (disp) - /* HELPERS FOR SAVE */ - /* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + .macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET #ifndef TRMMKERNEL lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) @@ -46,20 +45,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 #endif .endm - /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ .endm - /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ .endm - /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + .macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI #if defined(NN) || defined(NT) || defined(TN) || defined(TT) xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR @@ -78,8 +80,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI #endif .endm - /* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + .macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 #ifndef TRMMKERNEL xvmsubadp \VSOUT1,\VSINII, alpha_i @@ -89,23 +92,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp \VSOUT2,\VSINRR, alpha_i #endif .endm - /* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + .macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 xvmsubadp \VSOUT1,\VSINRR, alpha_r xvmaddadp \VSOUT2,\VSINII, alpha_r .endm - /* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrghd \VSOUT1,\VSIN2,\VSIN1 xxmrgld \VSOUT2,\VSIN2,\VSIN1 .endm + + .macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 stxv \VSIN1, DISPX(\LOFFSET)(\REG) stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) .endm + .macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -141,6 +149,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 .endm + .macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -161,6 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm + .macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -173,6 +183,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm + .macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 #ifndef TRMMKERNEL @@ -188,9 +199,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxmrghd vs7,vs15,vs14 stxv vs7, (\LOFFSET)(\BASE_REG) .endm - /********************************************************************************************** -* Macros for N=2 and M=8 +* + +.macros for N=2 and M=8 **********************************************************************************************/ .macro Zero2x8 @@ -228,269 +240,272 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs63, vs63, vs63 .endm -.macro LOAD2x8 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A - -.if \Zero==1 - Zero2x8 -.endif - + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + .endm + .macro END2x8_NORMAL END2x8 AO,BO,128,32 .endm -.macro END2x8 AREG, BREG, OffsetA, OffsetB +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs48, vs0, vs18 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - - xvmaddadp vs33, vs0, vs17 xvmaddadp vs49, vs0, vs19 - + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 xvmaddadp vs35, vs1, vs17 xvmaddadp vs51, vs1, vs19 - + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 xvmaddadp vs37, vs2, vs17 xvmaddadp vs53, vs2, vs19 - + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 xvmaddadp vs39, vs3, vs17 xvmaddadp vs55, vs3, vs19 - + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 xvmaddadp vs41, vs4, vs17 xvmaddadp vs57, vs4, vs19 - + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 xvmaddadp vs43, vs5, vs17 xvmaddadp vs59, vs5, vs19 - + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 xvmaddadp vs45, vs6, vs17 xvmaddadp vs61, vs6, vs19 - + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 xvmaddadp vs47, vs7, vs17 xvmaddadp vs63, vs7, vs19 - .endm -.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B 
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm .macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B xvmaddadp vs32, vs0, vs16 xvmaddadp vs48, vs0, vs18 xvmaddadp vs33, vs0, vs17 xvmaddadp vs49, vs0, vs19 - - xxswapd vs21, vs20 - xxswapd vs23, vs22 - + xxswapd vs21, vs20 + xxswapd vs23, vs22 xvmaddadp vs34, vs1, vs16 xvmaddadp vs50, vs1, vs18 - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs35, vs1, vs17 xvmaddadp vs51, vs1, vs19 - - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs52, vs2, vs18 - - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs37, vs2, vs17 xvmaddadp vs53, vs2, vs19 - - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - - xvmaddadp vs38, vs3, vs16 xvmaddadp vs54, vs3, vs18 - -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - xvmaddadp vs39, vs3, vs17 xvmaddadp vs55, vs3, vs19 - -.if \Complete==0 - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + 
lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs4, vs16 xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 xvmaddadp vs58, vs5, vs18 xvmaddadp vs43, vs5, vs17 xvmaddadp vs59, vs5, vs19 - -.if \Complete==0 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs6, vs16 xvmaddadp vs60, vs6, vs18 xvmaddadp vs45, vs6, vs17 xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 xvmaddadp vs62, vs7, vs18 xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 - -.if \Complete==0 - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs48, vs8, vs22 .if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif - -.endif + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs33, vs8, vs21 xvmaddadp vs49, vs8, vs23 - -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 .endif - xvmaddadp vs34, vs9, vs20 xvmaddadp vs50, vs9, vs22 xvmaddadp vs35, vs9, vs21 xvmaddadp vs51, vs9, vs23 - +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs52, vs10, vs22 xvmaddadp vs37, vs10, vs21 xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 xvmaddadp vs54, vs11, vs22 xvmaddadp vs39, vs11, vs21 xvmaddadp vs55, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs12, vs20 xvmaddadp vs56, vs12, vs22 xvmaddadp vs41, vs12, vs21 xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 xvmaddadp vs58, vs13, vs22 xvmaddadp vs43, vs13, vs21 xvmaddadp vs59, vs13, vs23 - +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs14, vs20 xvmaddadp vs60, vs14, vs22 xvmaddadp vs45, vs14, vs21 xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 xvmaddadp vs62, vs15, vs22 xvmaddadp vs47, vs15, vs21 xvmaddadp vs63, vs15, vs23 - +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if 
\IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .endm + + + + .macro KERNEL2x8 - LOAD2x8 0 + LOAD2x8 END2x8 AO, BO, 128,32 .endm -.macro SAVE2x8 +.macro SAVE2x8 add T1, CO ,LDC SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 addi CO, CO, 128 - .endm - /********************************************************************************************** -* Macros for N=2 and M=4 +* + +.macros for N=2 and M=4 **********************************************************************************************/ + .macro Zero2x4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -510,167 +525,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs47, vs47, vs47 .endm -.macro LOAD2x4 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - -.if \Zero==1 - Zero2x4 -.endif - + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A .endm + .macro END2x4_NORMAL END2x4 AO,BO,64,32 .endm -.macro END2x4 AREG, BREG, OffsetA, OffsetB +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 xvmaddadp vs47, vs3, vs19 .endm -.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm -.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs41, vs0, vs19 - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, 
DISP4(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) -.endif -.endif - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs43, vs1, vs19 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs45, vs2, vs19 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs47, vs3, vs19 +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm -.if \Complete==0 - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - -.if \Complete==0 +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif - - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs41, vs8, vs23 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs43, vs9, vs23 - - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs45, vs10, vs23 - - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs47, vs11, vs23 +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 .endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif 
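/*
   The .if \Complete==0 blocks above are the refill half of a two-step software
   pipeline: vs0-vs3 and vs16-vs19 carry the current k-step while vs8-vs11 and
   vs20-vs23 for the following step stream in between the FMAs, and the final
   trip (Complete==1) skips the refills. A plain-C model of that schedule, with
   helper names that are illustrative only and not part of the kernel:

       enum { K = 8, NBUF = 4 };
       static double cur[NBUF], nxt[NBUF], acc[NBUF];
       static void load(double *buf, int k)  { for (int i = 0; i < NBUF; i++) buf[i] = k + i; }
       static void fma_step(const double *b) { for (int i = 0; i < NBUF; i++) acc[i] += b[i]; }

       static void pipelined_loop(void) {
           load(cur, 0); load(nxt, 1);                      // LOAD2x4_2O primes both buffers
           for (int k = 0; k + 2 <= K; k += 2) {
               int last = (k + 2 == K);
               fma_step(cur); if (!last) load(cur, k + 2);  // first half: compute, then refill
               fma_step(nxt); if (!last) load(nxt, k + 3);  // second half, one step behind
           }                                                // last trip = KERNEL2x4_E2
       }
*/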
+ +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x4 - LOAD2x4 0 + LOAD2x4 END2x4 AO, BO, 64,32 .endm + + .macro SAVE2x4 add T1, CO ,LDC SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 addi CO, CO, 64 - .endm - /********************************************************************************************** -* Macros for N=2 and M=2 +* + +.macros for N=2 and M=2 **********************************************************************************************/ + .macro Zero2x2 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -680,231 +727,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
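/*
   A note on the arithmetic used throughout these complex kernels: each B value
   is kept twice, e.g. vs16 = (b_r, b_i) and vs17 = xxswapd(vs16) = (b_i, b_r),
   so one xvmaddadp pair per A element accumulates all four real products of a
   complex multiply. Modeled in C (struct and function names are ours, purely
   illustrative):

       typedef struct { double r, i; } zdouble;

       // rr_ii mirrors a vs32-style accumulator, ri_ir a vs33-style one
       static void zfma(const zdouble *a, const zdouble *b,
                        double rr_ii[2], double ri_ir[2]) {
           rr_ii[0] += a->r * b->r;   rr_ii[1] += a->i * b->i;   // a times vs16
           ri_ir[0] += a->r * b->i;   ri_ir[1] += a->i * b->r;   // a times vs17
       }

   The SAVEn macros (outside this hunk) then combine the lanes, in the
   non-conjugated case as real = rr - ii and imag = ri + ir.
*/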
xxlxor vs37, vs37, vs37 xxlxor vs38, vs38, vs38 xxlxor vs39, vs39, vs39 + .endm -.macro LOAD2x2 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - - -.if \Zero==1 - Zero2x2 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + .endm + .macro END2x2_NORMAL END2x2 AO,BO,32,32 .endm -.macro END2x2 AREG, BREG, OffsetA, OffsetB +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 xvmaddadp vs36, vs0, vs18 - xvmaddadp vs37, vs0, vs19 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs39, vs1, vs19 - -.endm - -.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - xvmaddadp vs36, vs0, vs18 xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs38, vs1, vs18 - xvmaddadp vs39, vs1, vs19 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif +.endm - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs37, vs8, vs23 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 - - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs39, vs9, vs23 +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + 
KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 .endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x2 - LOAD2x2 0 + LOAD2x2 END2x2 AO, BO, 32,32 .endm + + .macro SAVE2x2 add T1, CO ,LDC SAVE2 vs32,vs33,vs34,vs35,CO,0 SAVE2 vs36,vs37,vs38,vs39,T1,0 addi CO, CO, 32 .endm - /********************************************************************************************** -* Macros for N=2 and M=1 +* + +.macros for N=2 and M=1 **********************************************************************************************/ + + .macro Zero2x1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 + .endm -.macro LOAD2x1 Zero - lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.if \Zero==1 - Zero2x1 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A .endm + .macro END2x1_NORMAL END2x1 AO,BO,16,32 .endm -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif - +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs35, vs0, vs19 - -.endm - -.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 
-.endm - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - - xxswapd vs21, vs20 - xxswapd vs23, vs22 -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - -.if \Complete==0 + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 - - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs35, vs8, vs23 +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 .endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x1 - LOAD2x1 0 + LOAD2x1 END2x1 AO, BO, 16,32 .endm + + .macro SAVE2x1 add T1, CO ,LDC SAVE1 vs32,vs33,CO,0 @@ -913,8 +1028,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm /********************************************************************************************** -* Macros for N=1 and M=8 +* + +.macros for N=1 and M=8 **********************************************************************************************/ + + .macro Zero1x8 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -932,167 +1051,228 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs45, vs45, vs45 xxlxor vs46, vs46, vs46 xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 .endm -.macro LOAD1x8 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A - -.if \Zero==1 - Zero1x8 -.endif +.macro LOAD1x8 + LOAD1x8O 0,0 .endm + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x8_NORMAL END1x8 AO,BO,128,16 .endm -.macro END1x8 AREG, BREG, OffsetA, OffsetB +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 .endm -.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + 
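/*
   Offset bookkeeping for the 1x8 block: a complex double is 16 bytes, so one
   k-step consumes 128 bytes of A (M=8) and 16 bytes of B (N=1), and the _2
   macros work through two k-steps per call, 256 bytes of A and 32 of B. The
   DISPn helpers used below appear to reduce to the following C, assuming the
   definitions at the top of this file (they are not shown in this hunk):

       #define UNIT_SIZE 16                                         // bytes per complex double
       #define DISP16(ind, disp) ((ind) * UNIT_SIZE * 16 + (disp))  // A panel, 2 k-steps of M=8
       #define DISP2(ind, disp)  ((ind) * UNIT_SIZE * 2  + (disp))  // B panel, 2 k-steps of N=1

   which is consistent with END1x8_2 passing 256/32 and with the IsLast path
   adding DISP16(\Index,256) and DISP2(\Index,32) to AO and BO.
*/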
+.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21, vs20 - - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A .endif xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif + xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif + xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 - - +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A -.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, 
DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A .endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 xvmaddadp vs39, vs11, vs21 - +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs12, vs20 xvmaddadp vs41, vs12, vs21 xvmaddadp vs42, vs13, vs20 xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs14, vs20 xvmaddadp vs45, vs14, vs21 xvmaddadp vs46, vs15, vs20 xvmaddadp vs47, vs15, vs21 - +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + + + .macro KERNEL1x8 - LOAD1x8 0 + LOAD1x8 END1x8 AO, BO, 128,16 .endm + .macro SAVE1x8 - - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 addi CO, CO, 128 - .endm - /********************************************************************************************** -* Macros for N=1 and M=4 +* + +.macros for N=2 and M=4 **********************************************************************************************/ + .macro Zero1x4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -1104,324 +1284,542 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
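/*
   Every tile size in this file now follows the same five-macro contract:
   ZeroNxM clears the accumulators, LOADNxM_2 primes two k-steps,
   KERNELNxM_L2 is the steady state, KERNELNxM_E2 (or ENDNxM_2) drains the
   pipeline and advances AO/BO, and SAVENxM writes the tile. As a C
   pseudo-driver, with illustrative names and the two-per-trip k count made
   explicit:

       zero_acc();                    // ZeroNxM
       load2(0);                      // LOADNxM_2
       long k, trips = K / 2;
       for (k = 0; k + 1 < trips; k++)
           kernel2(k, 0);             // KERNELNxM_L2: FMAs interleaved with refills
       kernel2(k, 1);                 // KERNELNxM_E2: FMAs only, then bump pointers
       save();                        // SAVENxM
*/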
xxlxor vs39, vs39, vs39 .endm -.macro LOAD1x4 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17,vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - -.if \Zero==1 - Zero1x4 -.endif +.macro LOAD1x4 + LOAD1x4O 0,0 .endm + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x4_NORMAL END1x4 AO,BO,64,16 .endm -.macro END1x4 AREG, BREG, OffsetA, OffsetB +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 .endm -.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21,vs20 - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs47, vs3, vs19 - -.if 
\Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif .if \Complete==0 - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs37, vs10, vs21 xvmaddadp vs38, vs11, vs20 xvmaddadp vs39, vs11, vs21 - - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs41, vs8, vs23 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs43, vs9, vs23 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs47, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + .macro KERNEL1x4 - LOAD1x4 0 + LOAD1x4 END1x4 AO, BO, 64,16 .endm -.macro SAVE1x4 + + +.macro SAVE1x4 SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 addi CO, CO, 64 - .endm - /********************************************************************************************** -* Macros for N=1 and M=2 +* + +.macros for N=2 and M=2 **********************************************************************************************/ + .macro Zero1x2 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 -.endm - -.macro LOAD1x2 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17,vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - -.if \Zero==1 - Zero1x2 -.endif + xxlxor vs35, vs35, vs35 .endm + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x2_NORMAL END1x2 AO,BO,32,16 .endm -.macro END1x2 AREG, BREG, OffsetA, OffsetB +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, 
vs17 .endm -.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21,vs20 - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + .macro KERNEL1x2 - LOAD1x2 0 + LOAD1x2 END1x2 AO, BO, 32,16 .endm -.macro SAVE1x2 + + +.macro SAVE1x2 SAVE2 vs32,vs33,vs34,vs35,CO,0 addi CO, CO, 32 .endm - /********************************************************************************************** -* Macros for N=1 and M=1 +* + +.macros for N=2 and M=1 **********************************************************************************************/ + + .macro Zero1x1 xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 + xxlxor vs33, vs33, vs33 .endm -.macro LOAD1x1 Zero - lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17, vs16 -.if \Zero==1 - Zero1x1 -.endif - +.macro LOAD1x1 + 
LOAD1x1O 0,0 .endm + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + .macro END1x1_NORMAL END1x1 AO,BO,16,16 .endm -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif - - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 .endm -.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21, vs20 - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - -.if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17, vs16 -.endif - .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) .else - addi \AREG, \AREG, DISP2(\Index,32) + addi \AREG, \AREG, DISP2(\Index,32) addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif - - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 - - +.endif .endm + + .macro KERNEL1x1 - LOAD1x1 0 + LOAD1x1 END1x1 AO, BO, 16,16 - .endm -.macro SAVE1x1 + + +.macro SAVE1x1 SAVE1 vs32,vs33,CO,0 addi CO, CO, 16 .endm +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi 
\REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/param.h b/param.h index 8f78a6a64..9a1a68ecd 100644 --- a/param.h +++ b/param.h @@ -2256,7 +2256,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
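/*
   Before the blocking parameters: the REFRESH_* macros above implement the
   standard TRMM edge bookkeeping quoted in their C comments, with SHIFT_REG
   turning an element count into a byte shift (slwi by log2(count * 16), e.g.
   C_A=8 becomes slwi 7, a multiply by 128 bytes). Worked through for this
   kernel's 8x2 zgemm tile (C_A=8, C_B=2) as C over byte offsets; the variable
   names are illustrative:

       // tile entry: REFRESH_POINTERS PTR_A,PTR_B,off,bb, 8,2
       ptrba += off * 8 * 16;        // SHIFT_REG T2, off, 8
       ptrbb  = bb + off * 2 * 16;   // SHIFT_REG T4, off, 2
       // trip count: REFRESH_TEMP_BK gives temp = bk - off, or off + 8, or off + 2
       // tile exit:  REFRESH_AFTER_SAVE gives temp = bk - off - (LEFT ? 8 : 2),
       //             ptrba += temp * 8 * 16, ptrbb += temp * 2 * 16,
       //             and, under LEFT, off += 8

   (The inline "temp -= 8" and "temp -= 4" comments are leftovers from a
   fixed-size variant; the macro itself is generic in C_A and C_B.)
*/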
#define SGEMM_DEFAULT_Q 1025 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 1025 +#define ZGEMM_DEFAULT_Q 1026 #define SYMV_P 8 From 148c4cc5fd4db4d10dcda94c5640de12611b7669 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Wed, 5 Jun 2019 20:50:50 +0000 Subject: [PATCH 076/127] conflict resolve --- kernel/power/KERNEL.POWER9 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 126313c9a..0f91d6d7d 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o From 900d5a3205bd06c04990ff842449ec80808d5027 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jun 2019 10:18:40 +0200 Subject: [PATCH 077/127] Add gfortran workaround for ABI violations in LAPACKE for #2154 (see gcc bug 90329) --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index a95d6190f..49c02bbcb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -744,6 +744,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive +# work around ABI problem with passing single-character arguments +FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran From a0caa762b3066f28b1b9334932b39a3d377f79f9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jun 2019 10:24:16 +0200 Subject: [PATCH 078/127] Add gfortran workaround for ABI violations for #2154 (see gcc bug 90329) --- Makefile.power | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.power b/Makefile.power index 195f1930f..24d8aa8a7 100644 --- a/Makefile.power +++ b/Makefile.power @@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas endif endif +# workaround for C->FORTRAN ABI violation in LAPACKE +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -fno-optimize-sibling-calls +endif FLAMEPATH = $(HOME)/flame/lib From 6ca898b63b81325559cbd2e925bf245f2a8ac999 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jun 2019 23:17:03 +0200 Subject: [PATCH 079/127] Add gfortran workaround for potential ABI violation for #2154 --- cmake/fc.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index adec28a91..9d8a5713c 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -44,7 +44,10 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") + # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") + # work around ABI violation in passing string arguments from C + set(FCOMMON_OPT "$(FCOMMON_OPT) -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") From e674e1c73515fab38e263d121429a1a5da494a45 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jun 2019 09:31:13 +0200 Subject: [PATCH 080/127] Update fc.cmake --- cmake/fc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 
9d8a5713c..f54c989d4 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -47,7 +47,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") # work around ABI violation in passing string arguments from C - set(FCOMMON_OPT "$(FCOMMON_OPT) -fno-optimize-sibling-calls") + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") From 1f4b6a5d5d2c1e3aaf7ca1da6720825cd075391f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Jun 2019 09:50:13 +0200 Subject: [PATCH 081/127] Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds from #2143, -march=native precludes use of more specific options like -march=skylake-avx512 in individual kernels, and defeats the purpose of dynamic arch anyway. --- cmake/arch.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 470ea2a8f..b4547b7c9 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -73,7 +73,8 @@ if (DYNAMIC_ARCH) endif () if (NOT NO_AVX512) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) - endif () + string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) + endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () From 4ea794a52253ee56573922a15a64606ec82248a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Jun 2019 17:24:15 +0200 Subject: [PATCH 082/127] Avoid unintentional activation of TLS code via USE_TLS=0 fixes #2149 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 44eacda4b..c24647d62 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1070,7 +1070,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif -ifdef USE_TLS +ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif From d9ff2cd90df9a114701dcd6298ae8439d6648e04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jun 2019 23:01:35 +0200 Subject: [PATCH 083/127] Do not force gcc options on non-gcc compilers fixes compile failure with pgi 18.10 as reported on OpenBLAS-users --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1b7fe3ef4..d23645058 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -28,11 +28,15 @@ endif ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mavx2 +endif +ifeq $(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mavx2 endif endif endif +endif From 6d3efb2b5829d78926f818496de5572dbd34e64f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Jun 2019 08:08:11 +0200 Subject: [PATCH 084/127] Update Makefile.x86_64 --- Makefile.x86_64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index d23645058..99364752f 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -31,7 +31,7 @@ ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mavx2 endif -ifeq $(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mavx2 endif endif From bbd4bb0154b6c4bfc561dce07b71eba7c7fa9013 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Jun 2019 15:04:10 +0200 Subject: [PATCH 085/127] Zero ecx with a mov instruction PGI assembler does not like the initialization in the constraints. 
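For reference, the fixed helper as a self-contained, compilable snippet (the
operand order shown is the corrected one from the follow-up "Fix mov syntax"
commit below; the hunk in this commit still has the mov operands reversed):

    #include <stdio.h>

    /* ecx selects the cpuid sub-leaf, so it must be zeroed; doing that with an
       explicit mov sidesteps the "c"(0) input constraint PGI rejects. */
    static void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) {
        __asm__ __volatile__("mov $0, %%ecx;"
                             "cpuid"
                             : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                             : "0" (op));
    }

    int main(void) {
        int a, b, c, d;
        cpuid(0, &a, &b, &c, &d);
        printf("max basic cpuid leaf: %d\n", a);
        return 0;
    }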
--- common_x86_64.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index f59ff6627..9db66b545 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else - __asm__ __volatile__("cpuid" + __asm__ __volatile__("mov %%ecx, 0;" + "cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op), "c"(0)); + : "0" (op)); #endif } From 280552b988e4377d95bc2f77bc07d2c00bb544e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Jun 2019 18:35:43 +0200 Subject: [PATCH 086/127] Fix mov syntax --- common_x86_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_x86_64.h b/common_x86_64.h index 9db66b545..c05998d58 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -129,7 +129,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else - __asm__ __volatile__("mov %%ecx, 0;" + __asm__ __volatile__("mov $0, %%ecx;" "cpuid" : "=a" (*eax), "=b" (*ebx), From cdbfb891da2a8de14aa1d9bd7a57265284f7432c Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Mon, 17 Jun 2019 15:33:38 +0000 Subject: [PATCH 087/127] new sgemm 8x16 --- kernel/power/sgemm_logic_power9.S | 193 ++++++++-------- kernel/power/sgemm_macros_power9.S | 344 +++++++++++++++-------------- param.h | 2 +- 3 files changed, 288 insertions(+), 251 deletions(-) diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 25e8c8387..053836cbf 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -3,89 +3,89 @@ b L8 MY_ALIGN LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_0 - mtctr L + LOAD8x16_2 MY_ALIGN LSGEMM_L8x16_LOOP: - - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_2 64,32, 15,0 - KERNEL8x16_I1_L4_2 64,32, 16,0 - KERNEL8x16_I1_L4_2 64,32, 17,0 - KERNEL8x16_I1_L4_2 64,32, 18,0 - KERNEL8x16_I1_L4_2 64,32, 19,0 - KERNEL8x16_I1_L4_2 64,32, 20,0 - KERNEL8x16_I1_L4_2 64,32, 21,0 - KERNEL8x16_I1_L4_2 64,32, 22,0 - KERNEL8x16_I1_L4_2 64,32, 23,0 - KERNEL8x16_I1_L4_2 64,32, 24,0 - KERNEL8x16_I1_L4_2 64,32, 25,0 - KERNEL8x16_I1_L4_2 64,32, 26,0 - KERNEL8x16_I1_L4_2 64,32, 27,0 - KERNEL8x16_I1_L4_2 64,32, 28,0 - KERNEL8x16_I1_L4_2 64,32, 29,0 - KERNEL8x16_I1_L4_2 64,32, 30,0 - KERNEL8x16_I1_L4_2 64,32, 31,1 + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + 
KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: - END8x16 0, AO, BO, 64, 32 + END8x16_2 blr MY_ALIGN LSGEMM_L8x16_L64_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_3 64,32, 15,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 blr LSGEMM_L8x16_L32_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_3 64,32, 7,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 blr LSGEMM_L8x16_L16_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_3 64,32, 3,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 blr L8: @@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN: #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 7 /**(T11-1) % 128x */ + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ #else mr T12, K - addi T12,T12, -1 - srawi. L, T12, 7 /**(K-1) % 128x */ + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ #endif - ZERO8x16 + ZERO8x16 + mtctr L ble LSGEMM_L8x16_SUB0 bl LSGEMM_L8x16_LMAIN_SUB andi. L, T12, 127 @@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0: cmpwi T11,128 #else andi. 
L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else cmpwi K,128 -#endif - - bne LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB2_128: - bl LSGEMM_L8x16_L64_SUB - bl LSGEMM_L8x16_L64_SUB - b LSGEMM_L8x16_SAVE +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: andi. T10,L,64 @@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16: LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_3 64,32, 1,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 MY_ALIGN LSGEMM_L8x16_SUB2_4: andi. T10,L, 4 ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_0 - KERNEL8x16_I1_L4_3 64,32, 0,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_2: andi. T10,L, 2 ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_0 - KERNEL8x16_I1_L2_3 64,32, 0,1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 3f86a1d25..2c9e537c7 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ -.macro LOAD8x16_1 - LOAD8x16 1 -.endm - -.macro LOAD8x16_0 - LOAD8x16 0 -.endm + .macro KERNEL8x16_L1_L4 Index,IsLast KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 @@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm -.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - + .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm @@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
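/*
   Context for the macro rewrite that follows: the logic change above re-bases
   the sgemm trip count on K-2 because the reworked kernels consume two
   k-iterations per call. LOAD8x16_2 primes the first pair, the K==128 and
   K==129 special cases re-enter the unrolled body at LSGEMM_L8x16_K128 with
   AO/BO pre-adjusted, and the main-path counts reduce to roughly this C
   (illustrative only):

       long bodies = (K - 2) >> 7;    // srawi. L, T12, 7: full 128-k unrolled bodies
       long rem    = (K - 2) & 127;   // andi.  L, T12, 127: drained by the
                                      // 64/32/16/8/4/2/1 sub-kernels
*/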
xxlxor vs63, vs63, vs63 .endm -.macro LOAD8x16 Zero +.macro LOAD8x16 OffsetA,OffsetB - lxv vs24, 0(BO) - lxv vs28, 16(BO) + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask - lxv vs0, 0(AO) - lxv vs1, 16(AO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 - lxv vs2, 32(AO) - lxv vs3, 48(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif .endm .macro END8x16_NORMAL END8x16 0, AO, BO, 64,32 .endm +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + .macro END8x16 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 @@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete .endm .macro KERNEL8x16 First - LOAD8x16 0 + LOAD8x16 0,0 END8x16 \First, AO, BO, 64,32 .endm -.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro 
KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + xvmaddasp vs33, vs1,vs24 xvmaddasp vs48, vs0,vs28 - xvmaddasp vs52, vs0,vs29 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 xvmaddasp vs60, vs0,vs31 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - - - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs61, vs1,vs31 + xvmaddasp vs61, vs1,vs31 .if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) .endif - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs36, vs4,vs9 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, 
vs2,vs28 + xvmaddasp vs51, vs3,vs28 .if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 .endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) .else - addi \AREG, \AREG, DISP32(\Index,128) addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) .endif .endif - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs52, vs4,vs13 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs60, vs4,vs15 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs61, vs5,vs15 - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs62, vs6,vs15 - - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs51, vs7,vs12 - xvmaddasp vs55, vs7,vs13 - xvmaddasp vs59, vs7,vs14 - xvmaddasp vs63, vs7,vs15 - .endm diff --git a/param.h b/param.h index 9a1a68ecd..3934da6c8 100644 --- a/param.h +++ b/param.h @@ -2253,7 +2253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1025 +#define SGEMM_DEFAULT_Q 1026 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 1026 From a575f1e4c771b31ba29bd11af4a3190f240cf1d2 Mon Sep 17 00:00:00 2001 From: kavanabhat Date: Wed, 19 Jun 2019 15:27:14 +0530 Subject: [PATCH 088/127] Update dtrmm_kernel_16x4_power8.S --- kernel/power/dtrmm_kernel_16x4_power8.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 47e703a3a..57829ac51 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stvx v31, r11, r0 li r11,0 - stw r31, 144(SP) - stfd f1, ALPHA_SP stw r0, FZERO From 7684c4f8f8f979ec4d8a563e9b9cb442d9b04a80 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 20 Jun 2019 19:56:01 +0200 Subject: [PATCH 089/127] PGI compiler does not like -march=native --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 44eacda4b..fcb3cbe33 100644 --- a/Makefile.system +++ b/Makefile.system @@ -144,9 +144,10 @@ endif # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. ifeq ($(ARCH), x86_64) +ifneq ($(C_COMPILER), PGI) GETARCH_FLAGS += -march=native endif - +endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) From eebfeba7680e4b81f0803f44999c86303aa5945b Mon Sep 17 00:00:00 2001 From: Piotr Kubaj Date: Tue, 25 Jun 2019 10:58:56 +0200 Subject: [PATCH 090/127] Fix build on FreeBSD/powerpc64. Signed-off-by: Piotr Kubaj --- common_power.h | 6 +++--- kernel/power/axpy.S | 2 +- kernel/power/axpy_ppc440.S | 2 +- kernel/power/cgemm_kernel_8x4_power8.S | 6 +++--- kernel/power/ctrmm_kernel_8x4_power8.S | 6 +++--- kernel/power/dgemm_kernel_16x4_power8.S | 4 ++-- kernel/power/dtrmm_kernel_16x4_power8.S | 4 ++-- kernel/power/dtrsm_kernel_LT_16x4_power8.S | 4 ++-- kernel/power/gemm_beta.S | 2 +- kernel/power/gemm_kernel.S | 6 +++--- kernel/power/gemm_kernel_altivec.S | 2 +- kernel/power/gemm_kernel_altivec_cell.S | 2 +- kernel/power/gemm_kernel_altivec_g4.S | 2 +- kernel/power/gemm_kernel_cell.S | 6 +++--- kernel/power/gemm_kernel_g4.S | 4 ++-- kernel/power/gemm_kernel_hummer.S | 2 +- kernel/power/gemm_kernel_power3.S | 4 ++-- kernel/power/gemm_kernel_power6.S | 4 ++-- kernel/power/gemm_kernel_ppc440.S | 4 ++-- kernel/power/gemv_n.S | 4 ++-- kernel/power/gemv_n_ppc440.S | 4 ++-- kernel/power/gemv_t.S | 4 ++-- kernel/power/gemv_t_ppc440.S | 4 ++-- kernel/power/ger.S | 4 ++-- kernel/power/scal.S | 2 +- kernel/power/scal_ppc440.S | 2 +- kernel/power/sgemm_kernel_16x8_power8.S | 4 ++-- kernel/power/strmm_kernel_16x8_power8.S | 4 ++-- kernel/power/swap.S | 2 +- kernel/power/symv_L.S | 4 ++-- kernel/power/symv_U.S | 4 ++-- kernel/power/trsm_kernel_LN.S | 6 +++--- kernel/power/trsm_kernel_LT.S | 6 +++--- kernel/power/trsm_kernel_RT.S | 6 +++--- kernel/power/trsm_kernel_cell_LN.S | 6 +++--- kernel/power/trsm_kernel_cell_LT.S | 6 +++--- kernel/power/trsm_kernel_cell_RT.S | 6 +++--- kernel/power/trsm_kernel_hummer_LN.S | 2 +- kernel/power/trsm_kernel_hummer_LT.S | 2 +- kernel/power/trsm_kernel_hummer_RT.S | 2 +- kernel/power/trsm_kernel_power6_LN.S | 4 ++-- kernel/power/trsm_kernel_power6_LT.S | 4 ++-- kernel/power/trsm_kernel_power6_RT.S | 4 ++-- kernel/power/trsm_kernel_ppc440_LN.S | 4 ++-- kernel/power/trsm_kernel_ppc440_LT.S | 4 ++-- kernel/power/trsm_kernel_ppc440_RT.S | 4 ++-- kernel/power/zaxpy.S | 4 ++-- kernel/power/zaxpy_ppc440.S | 4 ++-- kernel/power/zgemm_beta.S | 2 +- kernel/power/zgemm_kernel.S | 8 ++++---- kernel/power/zgemm_kernel_8x2_power8.S | 6 +++--- kernel/power/zgemm_kernel_altivec.S | 6 +++--- kernel/power/zgemm_kernel_altivec_cell.S | 6 +++--- kernel/power/zgemm_kernel_altivec_g4.S | 4 ++-- kernel/power/zgemm_kernel_cell.S | 8 ++++---- kernel/power/zgemm_kernel_g4.S | 6 +++--- kernel/power/zgemm_kernel_hummer.S | 2 +- kernel/power/zgemm_kernel_power3.S | 6 +++--- kernel/power/zgemm_kernel_power6.S | 6 +++--- kernel/power/zgemm_kernel_power9.S | 4 ++-- kernel/power/zgemm_kernel_ppc440.S | 6 +++--- kernel/power/zgemv_n.S | 4 ++-- kernel/power/zgemv_n_ppc440.S | 4 ++-- 
kernel/power/zgemv_t.S | 4 ++-- kernel/power/zgemv_t_ppc440.S | 4 ++-- kernel/power/zger.S | 4 ++-- kernel/power/zscal.S | 2 +- kernel/power/zscal_ppc440.S | 2 +- kernel/power/zswap.S | 4 ++-- kernel/power/zsymv_L.S | 4 ++-- kernel/power/zsymv_U.S | 4 ++-- kernel/power/ztrmm_kernel_8x2_power8.S | 6 +++--- kernel/power/ztrsm_kernel_LN.S | 8 ++++---- kernel/power/ztrsm_kernel_LT.S | 8 ++++---- kernel/power/ztrsm_kernel_RT.S | 8 ++++---- kernel/power/ztrsm_kernel_cell_LN.S | 6 +++--- kernel/power/ztrsm_kernel_cell_LT.S | 8 ++++---- kernel/power/ztrsm_kernel_cell_RT.S | 6 +++--- kernel/power/ztrsm_kernel_hummer_LN.S | 2 +- kernel/power/ztrsm_kernel_hummer_LT.S | 2 +- kernel/power/ztrsm_kernel_hummer_RT.S | 2 +- kernel/power/ztrsm_kernel_power6_LN.S | 6 +++--- kernel/power/ztrsm_kernel_power6_LT.S | 6 +++--- kernel/power/ztrsm_kernel_power6_RT.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_LN.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_LT.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_RT.S | 6 +++--- 87 files changed, 193 insertions(+), 193 deletions(-) diff --git a/common_power.h b/common_power.h index 889205c75..f38b85864 100644 --- a/common_power.h +++ b/common_power.h @@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #ifndef __64BIT__ #define PROLOGUE \ .section .text;\ @@ -784,7 +784,7 @@ Lmcount$lazy_ptr: #define HALT mfspr r0, 1023 -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #if defined(PPC440) || defined(PPC440FP2) #undef MAX_CPU_NUMBER #define MAX_CPU_NUMBER 1 @@ -829,7 +829,7 @@ Lmcount$lazy_ptr: #define MAP_ANONYMOUS MAP_ANON #endif -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #ifndef __64BIT__ #define FRAMESLOT(X) (((X) * 4) + 8) #else diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S index fb9789da4..238771826 100644 --- a/kernel/power/axpy.S +++ b/kernel/power/axpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S index 81a660e4d..7733e46e7 100644 --- a/kernel/power/axpy_ppc440.S +++ b/kernel/power/axpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 8dbb6011d..2bc99974f 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index 26f49c663..822420dfd 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 41958eab0..651fd53fc 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ li r11,0 slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 57829ac51..84c65f503 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -269,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S index 7a4a30390..8a423f181 100644 --- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -217,7 +217,7 @@ li r11,0 #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 7acc05b4d..81457b698 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 16(SP) stw r0, 24(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S index e5e9ec346..37ff9c9e7 100644 --- a/kernel/power/gemm_kernel.S +++ b/kernel/power/gemm_kernel.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -186,7 +186,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -228,7 +228,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S index 6c7e78319..2dae49cb8 100644 --- a/kernel/power/gemm_kernel_altivec.S +++ b/kernel/power/gemm_kernel_altivec.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S index b7445a1f6..0823420dd 100644 --- a/kernel/power/gemm_kernel_altivec_cell.S +++ b/kernel/power/gemm_kernel_altivec_cell.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S index 548150143..3a214b248 100644 --- a/kernel/power/gemm_kernel_altivec_g4.S +++ b/kernel/power/gemm_kernel_altivec_g4.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S index f3d3b8325..26f9cb023 100644 --- a/kernel/power/gemm_kernel_cell.S +++ b/kernel/power/gemm_kernel_cell.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -192,7 +192,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -226,7 +226,7 @@ li PREC, 4 * SIZE #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef 
__64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S index 259f04c4e..a5c4d3a43 100644 --- a/kernel/power/gemm_kernel_g4.S +++ b/kernel/power/gemm_kernel_g4.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S index 3a8e1edfa..6ecbeb3e0 100644 --- a/kernel/power/gemm_kernel_hummer.S +++ b/kernel/power/gemm_kernel_hummer.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S index 4a6b5da62..f88bc291c 100644 --- a/kernel/power/gemm_kernel_power3.S +++ b/kernel/power/gemm_kernel_power3.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -187,7 +187,7 @@ li PREC, 4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index 1a412c4fb..b274f7655 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -183,7 +183,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S index b128beb38..c5ef6e4e5 100644 --- a/kernel/power/gemm_kernel_ppc440.S +++ b/kernel/power/gemm_kernel_ppc440.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -183,7 +183,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 02160bd61..abc61b62e 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -252,7 +252,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S index beb21200a..18d804520 100644 --- a/kernel/power/gemv_n_ppc440.S +++ b/kernel/power/gemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -199,7 +199,7 @@ stw r23, 180(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) 
#ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 457753065..25a4dd01b 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -260,7 +260,7 @@ stw r29, 220(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S index 6e560db6c..7d12b07a4 100644 --- a/kernel/power/gemv_t_ppc440.S +++ b/kernel/power/gemv_t_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -190,7 +190,7 @@ stw r22, 192(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/ger.S b/kernel/power/ger.S index fd397ce8c..d83546b0d 100644 --- a/kernel/power/ger.S +++ b/kernel/power/ger.S @@ -47,7 +47,7 @@ #endif #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -224,7 +224,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/scal.S b/kernel/power/scal.S index 7c65d1234..19fdd32ab 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S @@ -43,7 +43,7 @@ #define XX r4 #define PREA r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S index ed148834d..d977b0b59 100644 --- a/kernel/power/scal_ppc440.S +++ b/kernel/power/scal_ppc440.S @@ -43,7 +43,7 @@ #define XX r4 #define PRE r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index c72b00cf6..3e6440af8 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slwi LDC, LDC, 2 #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index f9b8a0bb8..78e539231 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/swap.S b/kernel/power/swap.S index e862b17bb..c9c0f86b0 100644 --- a/kernel/power/swap.S +++ b/kernel/power/swap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index f7d768c50..a4ff703e2 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -248,7 +248,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index d8e082397..c3063e077 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define IS r4 @@ -247,7 +247,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S index 7983c573b..8319d5ed8 100644 --- a/kernel/power/trsm_kernel_LN.S +++ b/kernel/power/trsm_kernel_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -236,7 +236,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S index c561fd014..30f25e015 100644 --- a/kernel/power/trsm_kernel_LT.S +++ b/kernel/power/trsm_kernel_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S index 07b88402c..d39d3a6e2 100644 --- a/kernel/power/trsm_kernel_RT.S +++ b/kernel/power/trsm_kernel_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -254,7 +254,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LN.S 
b/kernel/power/trsm_kernel_cell_LN.S index 803530cbb..f656015a8 100644 --- a/kernel/power/trsm_kernel_cell_LN.S +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S index 105e7d43c..083af7289 100644 --- a/kernel/power/trsm_kernel_cell_LT.S +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S index a54a261cb..5a5b67e77 100644 --- a/kernel/power/trsm_kernel_cell_RT.S +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S index 109dacb8c..35ffab427 100644 --- a/kernel/power/trsm_kernel_hummer_LN.S +++ b/kernel/power/trsm_kernel_hummer_LN.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S index 1ad062a7c..f7a09dbd8 100644 --- a/kernel/power/trsm_kernel_hummer_LT.S +++ b/kernel/power/trsm_kernel_hummer_LT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S index 94b3c0c85..0e563e5cc 100644 --- a/kernel/power/trsm_kernel_hummer_RT.S +++ b/kernel/power/trsm_kernel_hummer_RT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S index 937a6761a..83594c772 100644 --- a/kernel/power/trsm_kernel_power6_LN.S +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT 
-#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S index 924f00ec0..54a8547b0 100644 --- a/kernel/power/trsm_kernel_power6_LT.S +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S index 40ee5e28d..b2b27613c 100644 --- a/kernel/power/trsm_kernel_power6_RT.S +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S index 6b7312101..a708a084d 100644 --- a/kernel/power/trsm_kernel_ppc440_LN.S +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S index 28b109b96..31f82de2c 100644 --- a/kernel/power/trsm_kernel_ppc440_LT.S +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -176,7 +176,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S index df80cd393..f5005403c 100644 --- a/kernel/power/trsm_kernel_ppc440_RT.S +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S index ac5b249bb..b001f42d1 100644 --- a/kernel/power/zaxpy.S +++ b/kernel/power/zaxpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -123,7 +123,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S index 
b5c604e91..848a0135f 100644 --- a/kernel/power/zaxpy_ppc440.S +++ b/kernel/power/zaxpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -112,7 +112,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 1f4c29210..57c3bed50 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 8(SP) stw r0, 16(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S index 8ec8b674a..ae8a93e89 100644 --- a/kernel/power/zgemm_kernel.S +++ b/kernel/power/zgemm_kernel.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -169,7 +169,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -190,7 +190,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 5526b91c9..dfe2d9dc6 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -132,7 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -296,7 +296,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S index 2b650cd02..2525a8e58 100644 --- a/kernel/power/zgemm_kernel_altivec.S +++ b/kernel/power/zgemm_kernel_altivec.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -264,7 +264,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S index 642d1f2e7..47a79064d 100644 --- a/kernel/power/zgemm_kernel_altivec_cell.S +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -244,7 +244,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -270,7 +270,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S index 0f7a6f9aa..c305270bd 100644 --- a/kernel/power/zgemm_kernel_altivec_g4.S +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S index 8fd6b0afb..3d179378b 100644 --- a/kernel/power/zgemm_kernel_cell.S +++ b/kernel/power/zgemm_kernel_cell.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -175,7 +175,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -196,7 +196,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -230,7 +230,7 @@ li PREA, 16 * 12 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S index bf6bf77e8..b92fb4225 100644 --- a/kernel/power/zgemm_kernel_g4.S +++ b/kernel/power/zgemm_kernel_g4.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -185,7 +185,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -206,7 +206,7 @@ #endif #ifdef TRMMKERNEL -#if 
defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S index 991a64373..5546dd2f6 100644 --- a/kernel/power/zgemm_kernel_hummer.S +++ b/kernel/power/zgemm_kernel_hummer.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S index 471d3b9ae..d14cb1cd9 100644 --- a/kernel/power/zgemm_kernel_power3.S +++ b/kernel/power/zgemm_kernel_power3.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -161,7 +161,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -202,7 +202,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 3c28649bc..9b47b9fc1 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -199,7 +199,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -220,7 +220,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index 813f270b8..d1e60da6c 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
std r0, FLINK_SAVE(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S index 748b69a0c..ba99a21c5 100644 --- a/kernel/power/zgemm_kernel_ppc440.S +++ b/kernel/power/zgemm_kernel_ppc440.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -182,7 +182,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -203,7 +203,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index f93439986..708f1318d 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -250,7 +250,7 @@ stw r22, 176(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S index 55dd2d84f..bd1148b65 100644 --- a/kernel/power/zgemv_n_ppc440.S +++ b/kernel/power/zgemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -223,7 +223,7 @@ stw r22, 176(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index 9c6f510c2..d82fab16a 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -226,7 +226,7 @@ stw r0, 4 + FZERO #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S index bfc039a0c..d7f3ee027 100644 --- a/kernel/power/zgemv_t_ppc440.S +++ b/kernel/power/zgemv_t_ppc440.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -179,7 +179,7 @@ stw r0, 4 + FZERO #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zger.S b/kernel/power/zger.S index a9a607815..73757d448 100644 --- a/kernel/power/zger.S +++ b/kernel/power/zger.S @@ -47,7 +47,7 @@ #endif #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -235,7 +235,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) 
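
Every hunk in this patch applies the same widening: FreeBSD follows the same SysV ELF ABI as Linux on powerpc64, so each guard that selects the ELF argument registers or the FRAMESLOT stack offsets must let FreeBSD take the Linux branch. A minimal sketch of the recurring pattern, using the "#define A r6" case that appears in the kernels above (preprocessed PowerPC assembly, as in the .S files themselves):

/* formerly "#ifdef linux"; FreeBSD shares the ELF calling convention,
   so it must take the same branch */
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6      /* 32-bit ABI: pointer to matrix A arrives in r6 */
#endif
#endif

/* the 64-bit-only TRMM/TRSM offset load is widened the same way */
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
        ld      OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

The common_power.h hunks at the top of the patch make the macro layer agree, pairing OS_FREEBSD with OS_LINUX in the PROLOGUE, MAX_CPU_NUMBER and FRAMESLOT definitions, so the per-kernel guards and the shared macros stay consistent.
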
diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S index 2eb7b0df3..ae68ee672 100644 --- a/kernel/power/zscal.S +++ b/kernel/power/zscal.S @@ -43,7 +43,7 @@ #define XX r4 #define PREA r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S index d0e4c9bcf..55dd1b87b 100644 --- a/kernel/power/zscal_ppc440.S +++ b/kernel/power/zscal_ppc440.S @@ -43,7 +43,7 @@ #define XX r4 #define PRE r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S index 8befadca2..415164a2b 100644 --- a/kernel/power/zswap.S +++ b/kernel/power/zswap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -117,7 +117,7 @@ stfd f30, 128(SP) stfd f31, 136(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index b348e328f..9f00df072 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -259,7 +259,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index b631cbe35..fe97fde8b 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define IS r4 @@ -256,7 +256,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index c1415138c..684cbd6eb 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -280,7 +280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S index 87473b45d..3acd9562d 100644 --- a/kernel/power/ztrsm_kernel_LN.S +++ b/kernel/power/ztrsm_kernel_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -244,7 +244,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S index db0860124..2d4f31189 100644 --- a/kernel/power/ztrsm_kernel_LT.S +++ b/kernel/power/ztrsm_kernel_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S index c50ab86df..605363119 100644 --- a/kernel/power/ztrsm_kernel_RT.S +++ b/kernel/power/ztrsm_kernel_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S index 884a3e864..4798b5958 100644 --- a/kernel/power/ztrsm_kernel_cell_LN.S +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S 
index 388dfe3c2..654938a4d 100644 --- a/kernel/power/ztrsm_kernel_cell_LT.S +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -246,7 +246,7 @@ li PREA, 16 * 12 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S index 00b50fe04..e3fe84d00 100644 --- a/kernel/power/ztrsm_kernel_cell_RT.S +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S index bf3eafa45..042f4d476 100644 --- a/kernel/power/ztrsm_kernel_hummer_LN.S +++ b/kernel/power/ztrsm_kernel_hummer_LN.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S index 865c85f78..fc8a0bef8 100644 --- a/kernel/power/ztrsm_kernel_hummer_LT.S +++ b/kernel/power/ztrsm_kernel_hummer_LT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S index 99868f948..17e31ffa8 100644 --- a/kernel/power/ztrsm_kernel_hummer_RT.S +++ b/kernel/power/ztrsm_kernel_hummer_RT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S index 65b8077db..3c40f605a 100644 --- a/kernel/power/ztrsm_kernel_power6_LN.S +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S index c27170604..b2a92301d 100644 --- a/kernel/power/ztrsm_kernel_power6_LT.S +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if 
defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S index ff0338cdc..cf37b5ca0 100644 --- a/kernel/power/ztrsm_kernel_power6_RT.S +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S index d33522456..f0be64d81 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LN.S +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S index a9e7b891f..d5ff1b57f 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LT.S +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S index 43f4b07cb..b77dd76d1 100644 --- a/kernel/power/ztrsm_kernel_ppc440_RT.S +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif From 5a4f1a21188a99d935482f1bda057d4ea42d34f4 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 28 Jun 2019 10:29:44 +0000 Subject: [PATCH 091/127] Fix build for PPC970 on FreeBSD pt. 1 FreeBSD needs DCBT_ARG=0 as well. 
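DCBT_ARG is the hint operand the Power kernels pass to the dcbt (data cache block touch) prefetch instruction. The nonzero hint evidently cannot be used in PPC970 builds, which is why common_power.h already selects 0 for PPC970 on Darwin; the FreeBSD/PPC970 build needs the same. A minimal sketch (a hypothetical standalone test program, not part of this patch) mirroring the selection logic from common_power.h after this change:

    #include <stdio.h>

    /* Mirrors the DCBT_ARG condition in common_power.h with this patch
       applied: PPC970 now gets 0 on both Darwin and FreeBSD. */
    #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || \
        defined(POWER8) || defined(POWER9) || \
        ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
    #define DCBT_ARG 0
    #else
    #define DCBT_ARG 8
    #endif

    int main(void) {
        /* e.g. "cc -DPPC970 -DOS_FREEBSD dcbt_arg.c" should print 0 */
        printf("DCBT_ARG = %d\n", DCBT_ARG);
        return 0;
    }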
--- common_power.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_power.h b/common_power.h index f38b85864..5e15b7554 100644 --- a/common_power.h +++ b/common_power.h @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) ) #define DCBT_ARG 0 #else #define DCBT_ARG 8 From 7c7505a7784a698ecfac453284080b5074a7b102 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 28 Jun 2019 10:31:45 +0000 Subject: [PATCH 092/127] Fix build for PPC970 on FreeBSD pt.2 FreeBSD needs those macros too. --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 9a1a68ecd..0f354f2bc 100644 --- a/param.h +++ b/param.h @@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#if defined(OS_LINUX) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_DARWIN) || defined(OS_FREEBSD) #if L2_SIZE == 1024976 #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 256 From a97b301aaabbe4bdf99a4506cc6a007d707f8b14 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Tue, 18 Jun 2019 15:55:56 +0000 Subject: [PATCH 093/127] cgemm/ctrmm power9 --- kernel/power/KERNEL.POWER9 | 6 +- kernel/power/cgemm_kernel_power9.S | 293 +++ kernel/power/cgemm_logic_power9.S | 2816 ++++++++++++++++++++++++++ kernel/power/cgemm_macros_power9.S | 3019 ++++++++++++++++++++++++++++ kernel/power/zgemm_logic_power9.S | 2 +- param.h | 4 +- 6 files changed, 6134 insertions(+), 6 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power9.S create mode 100644 kernel/power/cgemm_logic_power9.S create mode 100644 kernel/power/cgemm_macros_power9.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 0f91d6d7d..31a5deeba 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -5,7 +5,7 @@ STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S @@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMKERNEL = cgemm_kernel_power9.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = cgemm_tcopy_8_power8.S +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S new file mode 100644 index 000000000..4b5c2fa31 --- /dev/null +++ b/kernel/power/cgemm_kernel_power9.S @@ -0,0 +1,293 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, 
FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S new file mode 100644 index 000000000..b4f937e90 --- /dev/null +++ b/kernel/power/cgemm_logic_power9.S @@ -0,0 +1,2816 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + 
KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + 
KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. 
L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + 
KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 
64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S new file mode 100644 index 000000000..a256e1a01 --- /dev/null +++ b/kernel/power/cgemm_macros_power9.S @@ -0,0 +1,3019 @@ + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf (quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define unit_size 8
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* for the both-conjugated cases we assume alpha was loaded as {-alpha_r,-alpha_i} */
+ /* the real accumulator holds i1*i2-r1*r2, so negating alpha_r fixes the sign of the real part */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* likewise, negating alpha_i fixes the sign of the imaginary part */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#else // CC || CR || RC || RR
+ /* for the both-conjugated cases we assume alpha was loaded as {-alpha_r,-alpha_i} */
+ /* the real accumulator holds i1*i2-r1*r2, so negating alpha_r fixes the sign of the real part */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* likewise, negating alpha_i fixes the sign of the imaginary part */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* PART1: VSOUT1 = {i0,i1} * {alpha_i,alpha_i} ; VSOUT2 = {r0,r1} * {alpha_i,alpha_i}
+ (PART2 then subtracts from VSOUT1 and accumulates into VSOUT2) */
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmulsp \VSOUT1,\VSINII, alpha_i
+ xvmulsp \VSOUT2,\VSINRR, alpha_i
+.endm
+
+/* PART2: VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 += {i0,i1} * {alpha_r,alpha_r} */
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r
+.endm
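+
+/* For reference, a scalar sketch of what the aggregate/scale pair above computes
+ per output element in the plain NN case (the names below are illustrative only,
+ not kernel symbols):
+
+ re = acc_rr - acc_ii; // AGGREGATE: r1*r2 - i1*i2
+ im = acc_ri + acc_ir; // AGGREGATE: r1*i2 + i1*r2
+ out_re = re * alpha_r - im * alpha_i; // PART1 mul, then xvmsubasp in PART2
+ out_im = im * alpha_r + re * alpha_i; // PART1 mul, then xvmaddasp in PART2
+
+ The conjugated variants differ only in which accumulator is added or
+ subtracted (and, for CC/CR/RC/RR, in the sign assumed on alpha). */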
+
+/* macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro Zero4x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+
+.macro LOAD4x8
+ LOAD4x8O 0,0
+.endm
+
+
+.macro LOAD4x8O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_NORMAL
+ END4x8 AO,BO,64,32
+.endm
+
+
+.macro END4x8_WITHOUT_ADD
+ END4x8 AO,BO,0,0
+.endm
+
+
+.macro END4x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.endm
+
+
+.macro LOAD4x8_2
+ LOAD4x8_2O 0,0
+.endm
+
+
+.macro LOAD4x8_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs12, (16+\OffsetB)(BO)
+ lxv vs24, (32+\OffsetB)(BO)
+ lxv vs28, (32+16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_2
+ /* for the two-iteration load the end offsets are 128 (A) and 64 (B) */
+ KERNEL4x8_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5,
DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm 
vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv 
vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, 
(\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + 
xvaddsp vs26,vs26,vs9 + xvaddsp vs25,vs25,vs3 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs10,vs2,0 + xxpermdi vs25,vs0,vs8,3 + xxpermdi vs27,vs2,vs10,3 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 0(T1) + stxv vs26 , 0(T2) + stxv vs27 , 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD4x1 + LOAD4x1O 0,0 +.endm + + +.macro LOAD4x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END4x1_NORMAL + END4x1 AO,BO,8,32 +.endm + + +.macro END4x1_WITHOUT_ADD + END4x1 AO,BO,0,0 +.endm + + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD4x1_2 + LOAD4x1_2O 0,0 +.endm + + +.macro LOAD4x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) +.endm + + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 +.endm + + +.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8,32 +.endm + + +.macro SAVE4x1 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + 
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
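+ /* the two reloads above fetch vs4/vs5 for the next iteration while the
+ current FMAs are still in flight; the E2 form (Complete==1) omits them
+ because no further iteration follows */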
+.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv 
vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv 
vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, 
(64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro 
LOAD1x1_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+.endm
+
+
+.macro END1x1_2
+ /*for load2 offset will be 16 and 16*/
+ KERNEL1x1_2 AO,BO, 16,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs8, DISP2(\Index,\OffsetB)(\BREG)
+ lxv vs4, DISP2(\Index,\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP2(\Index,16)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 8,8
+.endm
+
+
+.macro SAVE1x1
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+ /*aggregate x2*/
+ xxpermdi vs33,vs32,vs32,2
+ xxpermdi vs41,vs40,vs40,2
+ xvaddsp vs32,vs32,vs33
+ xvaddsp vs40,vs40,vs41
+
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ /*inner reverse save_permute and store vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs37,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs37,vs1
+
+/* reconstruct r,i pairs*/
+ xxperm vs37,vs1, vs28
+
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs36,vs36,vs37
+ stxsd v4 , 0(CO)
+#else
+
+/* vs37 is v5 */
+ stxsd v5 , 0(CO)
+#endif
+ addi CO, CO, 8
+.endm
+
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 3
+ .endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*8;
+// ptrbb = bb + off*4;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+8; // number of values in A
+// #else
+// temp = off+4; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B; // number of values in B */
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index f902484a3..fe5d8ade2 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1353,7 +1353,7 @@ ZGEMM_L1: ZGEMM_L1_BEGIN: /*----------------------------------------*/ mr CO, C - slwi T1, LDC , 1 + add T2,C,LDC mr AO, A add C, C, T1 diff --git a/param.h b/param.h index 3934da6c8..84e577acc 100644 --- a/param.h +++ b/param.h @@ -2250,12 +2250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 832 #define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 640 +#define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 1026 #define DGEMM_DEFAULT_Q 384 -#define CGEMM_DEFAULT_Q 640 +#define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 #define SYMV_P 8 From 9086543f503f63d9107ce539650f28918b027015 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:29:47 +0200 Subject: [PATCH 094/127] Utest needs CBLAS but not necessarily FORTRAN --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 50da721cd..d7d9c2fce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,7 +211,8 @@ if (USE_THREAD) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() -if (MSVC OR NOT NOFORTRAN) +#if (MSVC OR NOT NOFORTRAN) +if (NOT NO_CBLAS) # Broken without fortran on unix add_subdirectory(utest) endif() From ae9e8b131e27f65684cf4cb98e03b7df4b290142 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:30:33 +0200 Subject: [PATCH 095/127] Add mingw builds to Appveyor config --- appveyor.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 44a616aaa..2f9cc7b0b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,7 +35,14 @@ environment: DYNAMIC_ARCH: ON WITH_FORTRAN: no - COMPILER: cl - + - COMPILER: MinGW64-gcc-7.2.0-mingw + DYNAMIC_ARCH: OFF + WITH_FORTRAN: ignore + - COMPILER: MinGW64-gcc-7.2.0 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-5.3.0 + WITH_FORTRAN: ignore + install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force @@ -52,7 +59,14 @@ install: before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + - if [%COMPILER%]==[MinGW-gcc-5.3.0] set 
PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+  - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+  - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
   - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
+  - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
+  - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
+  - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
   - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
   - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
   - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
@@ -64,3 +78,4 @@ test_script:
   - echo Running Test
   - cd utest
   - openblas_utest
+

From f69a0be712a9dccf5fcf433a734eb1371cb6189a Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 6 Jul 2019 15:02:39 +0200
Subject: [PATCH 096/127] Add getarch flags to disable AVX on x86 (and other
 small fixes to match Makefile behaviour)

---
 cmake/system.cmake | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 7f3696286..1c2093efe 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -70,6 +70,13 @@ if (X86_64)
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
 endif ()
 
+# On x86 no AVX support is available
+if (X86 OR X86_64)
+if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("${CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
+ set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
+endif ()
+endif ()
+
 if (INTERFACE64)
   message(STATUS "Using 64-bit integers.")
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@@ -148,7 +155,9 @@ else()
 endif ()
 
 include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
-
+if (DEFINED BINARY)
+ message(STATUS "Compiling a ${BINARY}-bit binary.")
+endif ()
 if (NOT DEFINED NEED_PIC)
   set(NEED_PIC 1)
 endif ()
@@ -165,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
 if (NOT NOFORTRAN)
   # Fortran Compiler dependent settings
   include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
+else ()
+set(NO_LAPACK 1)
+set(NO_LAPACKE 1)
 endif ()
 
 if (BINARY64)
@@ -190,9 +202,14 @@ if (NEED_PIC)
 endif ()
 
 if (DYNAMIC_ARCH)
-  set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
-  if (DYNAMIC_OLDER)
-    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+ if (X86 OR X86_64 OR ARM64 OR PPC)
+  set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
+  if (DYNAMIC_OLDER)
+   set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+  endif ()
+ else ()
+  unset (DYNAMIC_ARCH)
+  message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
  endif ()
 endif ()

From 04d671aae2b452a0bf63837c289f8948c35eb675 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 6 Jul 2019 15:05:04 +0200
Subject: [PATCH 097/127] Make disabling DYNAMIC_ARCH on unsupported systems
 work

DYNAMIC_ARCH needs to be unset in the cache for the change to have any effect
---
 cmake/arch.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index b4547b7c9..5a7434551 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake @@ -81,7 +81,8 @@ if (DYNAMIC_ARCH) endif () if (NOT DYNAMIC_CORE) - unset(DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") + unset(DYNAMIC_ARCH CACHE) endif () endif () From 8fb76134bc0711634b410fa20d6eb113f8893a04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:07:15 +0200 Subject: [PATCH 098/127] Mingw32 needs leading underscore on object names (also copy BUNDERSCORE settings for FORTRAN from the corresponding Makefile) --- cmake/prebuild.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a67c44bf5..e508a46c2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -59,6 +59,9 @@ set(FU "") if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) set(FU "_") endif() +if(MINGW AND NOT MINGW64) + set(FU "_") +endif() set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) if (${COMPILER_ID} STREQUAL "GNU") @@ -82,6 +85,11 @@ endif () # f_check if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") +else () + file(APPEND ${TARGET_CONF_TEMP} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") + set(BU "_") endif () # Cannot run getarch on target if we are cross-compiling From b89c781637503ec66117eb3b887a3755d42f0f46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jul 2019 16:04:45 +0200 Subject: [PATCH 099/127] Fix surprising behaviour of NO_AFFINITY=0 --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..09a648e4a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1124,8 +1124,12 @@ endif endif ifdef NO_AFFINITY +ifeq ($(NO_AFFINITY), 0) +override undefine NO_AFFINITY +else CCOMMON_OPT += -DNO_AFFINITY endif +endif ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE From b89d9762a29ac84422ebb6092584831efd85d355 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Mon, 8 Jul 2019 17:13:21 -0500 Subject: [PATCH 100/127] Change install_name on osx to match linux --- Makefile | 1 + Makefile.install | 3 ++- exports/Makefile | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 07b08439e..60f189ef2 100644 --- a/Makefile +++ b/Makefile @@ -109,6 +109,7 @@ endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll diff --git a/Makefile.install b/Makefile.install index fefecd98d..8070b4729 100644 --- a/Makefile.install +++ b/Makefile.install @@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" diff --git a/exports/Makefile b/exports/Makefile index b1348bd4a..d32e449df 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > 
$(@F) +ifeq ($(OSNAME), Darwin) +INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib +endif + ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else @@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c From 0ba29fd2625dfe405a08005a22d0fa21293cc16c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:46:51 +0800 Subject: [PATCH 101/127] Update dgemm_kernel_4x8_haswell.S for zen2 replaced a bunch of vpermpd instructions with vpermilpd and vperm2f128 --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 120 ++++++++++------------- 1 file changed, 54 insertions(+), 66 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c84b599ce..5416018bb 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 @@ -153,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -206,7 +206,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -222,7 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -232,7 +232,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -247,7 +247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO @@ -257,7 +257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -284,18 +284,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -324,18 +322,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -365,18 +361,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -687,7 +681,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -695,7 +689,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -710,14 +704,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -729,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -737,7 +731,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -750,7 +744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -758,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -770,7 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO @@ -778,7 +772,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -799,18 +793,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,18 +831,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1084,13 +1074,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1100,12 +1090,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1114,13 +1104,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1130,13 +1120,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm @@ -1145,13 +1135,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1165,18 +1155,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 7a9050d6817dd63e4b3cb641566b03f069be47a9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:55:06 +0800 Subject: [PATCH 102/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5416018bb..b98610524 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -292,8 +292,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -330,8 +330,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -369,8 +369,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -801,8 +801,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,8 +839,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 182b06d6adb445d00066eff3b15da335ee1656bc Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 17:02:35 +0800 Subject: [PATCH 103/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 40 ++++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b98610524..814a1c350 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -317,10 +317,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -356,10 +356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -395,10 +395,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm @@ -826,10 +826,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -865,10 +865,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 1733f927e6b892610bda045538a42d495faa1af5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 21:27:41 +0800 Subject: [PATCH 104/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 814a1c350..b30ecccea 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -106,7 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define A_PR1 512 -#define B_PR1 512 +#define B_PR1 160 /******************************************************************************************* * Macro definitions From 211ab03b1402a3c39311b7ca769aaad736ca554c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 22:39:15 +0800 Subject: [PATCH 105/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b30ecccea..3f7f9a98e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,23 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12 + prefetcht0 128(%rsp) /*BUFFER 1*/ vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - + prefetcht0 192(%rsp) vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - + prefetcht0 256(%rsp) vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - + prefetcht0 320(%rsp) vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 From 8a074b39656636ebec5812532b486cf751231a3b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:47:30 +0800 Subject: [PATCH 106/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 42 +++++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 3f7f9a98e..5242e3efe 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,24 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE4x12 - prefetcht0 128(%rsp) /*BUFFER 1*/ + prefetcht0 BUFFER1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 192(%rsp) + prefetcht0 64 + BUFFER1 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - prefetcht0 256(%rsp) + prefetcht0 128 + BUFFER1 vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - prefetcht0 320(%rsp) + prefetcht0 192 + BUFFER1 vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1606,6 +1606,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + addq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + leaq (CO1,LDC,2),CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + subq LDC,CO1 +.endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) @@ -1773,7 +1804,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - + + PREFETCHT0_C .L12_12a: KERNEL4x12_M1 From 9b04baeaeeaaaeba8c12e3fc2418ceaeca53ebb0 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:50:03 +0800 Subject: [PATCH 107/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5242e3efe..42692f33b 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -318,10 +318,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -357,10 +357,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -396,10 +396,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
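
NOTE (illustrative sketch, not part of any patch): patch 107, continued
below, demotes the C-tile prefetches from prefetcht0 to prefetcht1. The
hint suffix selects the nearest cache level the line is pulled into: t0
targets all levels including L1, t1 starts at L2, t2 at the last-level
cache (semantics per Intel's documentation). In intrinsics form:

#include <xmmintrin.h>

void touch_cols(const double *c, long ldc) {
    _mm_prefetch((const char *)c,             _MM_HINT_T0); /* prefetcht0 */
    _mm_prefetch((const char *)(c + ldc),     _MM_HINT_T1); /* prefetcht1 */
    _mm_prefetch((const char *)(c + 2 * ldc), _MM_HINT_T2); /* prefetcht2 */
}
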
vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 9c89757562f43af48645a6563161909321077646 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:47:58 +0800 Subject: [PATCH 108/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 29 +++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 42692f33b..e26bddea3 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,6 +1865,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L12_11 ALIGN_4 @@ -1872,6 +1880,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L12_20: // Test rest of M @@ -2102,7 +2115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jmp .L13_16 - + PREFETCHT0_C .L13_13: test $1, %rax @@ -2147,6 +2160,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L13_11 ALIGN_4 @@ -2154,6 +2175,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L13_20: // Test rest of M From 825777faab163326f38a0e6203ef1fb6fa8de6af Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:58:24 +0800 Subject: [PATCH 109/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index e26bddea3..225af3673 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,12 +1865,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) - addq $128, B + addq $128, B /* increment */ sarq $3, K decq I # i -- @@ -1880,6 +1883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
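
NOTE (illustrative sketch, not part of any patch): patch 108 above warms the
next packed B block with prefetcht2 while the current one is still being
consumed, stepping B forward 128 bytes per 4-row tile and rewinding it by
(M/4)*128 afterwards (the sarq/salq/subq sequence). A scalar sketch of the
same bookkeeping, with hypothetical names:

const double *m_loop(const double *b_next, long m) {
    for (long i = 0; i < m / 4; i++) {
        /* ... one 4x12 tile of compute elided ... */
        __builtin_prefetch((const char *)b_next,      0, 1); /* ~prefetcht2 */
        __builtin_prefetch((const char *)b_next + 64, 0, 1);
        b_next += 128 / sizeof(double);          /* addq $128, B           */
    }
    b_next -= (m / 4) * (128 / sizeof(double));  /* recover the original B */
    return b_next;
}
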
/************************************************************************** * Rest of M ***************************************************************************/ + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I @@ -2160,6 +2164,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 (B) prefetcht2 (B, K, 8) @@ -2175,7 +2182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ - + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I From f49f8047acbea636eb2a3542f306803a1285793b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 14:33:37 +0800 Subject: [PATCH 110/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 50 ++++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 225af3673..6d1460bb2 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 >= 96 prefetcht0 128 + BUFFER1 +#endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 >= 160 prefetcht0 192 + BUFFER1 +#endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 - +#if B_PR1 >= 224 + prefetcht0 256 + BUFFER1 +#endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - +#if B_PR1 >= 288 + prefetcht0 320 + BUFFER1 +#endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - +#if B_PR1 >= 352 + prefetcht0 384 + BUFFER1 +#endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#if B_PR1 >= 416 + prefetcht0 448 + BUFFER1 +#endif leaq (CO1, LDC, 2), %rax +#if B_PR1 >= 480 + prefetcht0 512 + BUFFER1 +#endif #if !defined(TRMMKERNEL) @@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) addq $128, B /* increment */ +#endif sarq $3, K decq I # i -- @@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
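
NOTE (back-of-envelope illustration of the proportionality comments in
patches 109/110 above; the patch constants themselves are empirical, and
the block-size figures below are assumptions, not OpenBLAS settings): one
sweep over M runs GEMM_P/4 tiles, and the next 12-wide slab of packed B
holds GEMM_Q * 12 doubles, so a per-tile step that just covers the slab is

long prefetch_step(long gemm_p, long gemm_q) {
    long tiles = gemm_p / 4;                   /* 4-row tiles per M sweep */
    long slab  = gemm_q * 12 * sizeof(double); /* next B slab in bytes    */
    return slab / tiles;  /* 384*GEMM_Q/GEMM_P; ~192 if GEMM_P = 2*GEMM_Q */
}

which is consistent with the chosen increments scaling with GEMM_Q/GEMM_P
(128 bytes where GEMM_P is twice GEMM_Q, 64 where it is four times).
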
/************************************************************************** * Rest of M ***************************************************************************/ - /* recover the original value of pointer B */ + + /* recover the original value of pointer B after prefetch */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L12_20: @@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) addq $128, B +#endif sarq $3, K decq I # i -- @@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* recover the original value of pointer B */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L13_20: From 94db259e5b432a7f1769c1d61071b9dd727778db Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:04:41 +0800 Subject: [PATCH 111/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 45 ++++++++++-------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6d1460bb2..6a8619e32 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,35 +1622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C + prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) prefetcht0 24(CO1,LDC,4) prefetcht0 (CO1,LDC,8) prefetcht0 24(CO1,LDC,8) - addq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - leaq (CO1,LDC,2),CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - subq LDC,CO1 .endm /*******************************************************************************************/ @@ -1820,12 +1798,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - PREFETCHT0_C .L12_12a: - + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2133,9 +2118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13_12a: + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2145,7 +2139,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
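
NOTE (illustrative sketch, not part of any patch): patch 111 above shrinks
PREFETCHT0_C to a single column group and interleaves four invocations with
the KERNEL4x12 steps, adjusting CO1 in between, instead of issuing all
prefetches for the 4x12 C tile in one burst. The shape of that idea in C
(names hypothetical; ldc counts elements):

#include <xmmintrin.h>

static void prefetch_c_cols(const double *col, long ldc) {
    /* columns j, j+4 and j+8 of the tile; byte offsets 0 and 24 cover
       the two cache lines a 4-double column can straddle */
    _mm_prefetch((const char *)&col[0],            _MM_HINT_T0);
    _mm_prefetch((const char *)&col[0] + 24,       _MM_HINT_T0);
    _mm_prefetch((const char *)&col[4 * ldc],      _MM_HINT_T0);
    _mm_prefetch((const char *)&col[4 * ldc] + 24, _MM_HINT_T0);
    _mm_prefetch((const char *)&col[8 * ldc],      _MM_HINT_T0);
    _mm_prefetch((const char *)&col[8 * ldc] + 24, _MM_HINT_T0);
}

void prefetch_c_tile(double *c, long ldc) {
    for (int j = 0; j < 4; j++) {
        /* ... one unrolled KERNEL4x12 step would run here ... */
        prefetch_c_cols(c + j * ldc, ldc);
    }
}
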
jmp .L13_16 - PREFETCHT0_C .L13_13: test $1, %rax From 9440fa607d146f1b91d70e35404f0d4abe50ffc5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:08:22 +0800 Subject: [PATCH 112/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6a8619e32..c834239be 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,7 +1622,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C - prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) @@ -1799,6 +1798,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L12_12 .L12_12a: + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 @@ -2117,7 +2117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L13_12 .L13_12a: - + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 From 4801c6d36bd87421b08e60efa1b6e0217fd41672 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 00:47:45 +0800 Subject: [PATCH 113/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c834239be..26eea0acf 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1866,7 +1866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ @@ -2184,19 +2184,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ prefetcht2 (B) prefetcht2 (B, K, 8) - addq $64, B + addq $64, B /* increment */ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) - addq $128, B + addq $128, B /* increment */ #endif sarq $3, K From 95fb98f556adcbbccc5f42318c7c645ec1837e1a Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 01:10:32 +0800 Subject: [PATCH 114/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 26eea0acf..082e62a7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,43 +279,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 >= 96 +#if B_PR1 > 32 prefetcht0 128 + BUFFER1 #endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 >= 160 +#if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 -#if B_PR1 >= 224 +#if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#if B_PR1 >= 288 +#if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#if B_PR1 >= 352 +#if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#if B_PR1 >= 416 +#if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif leaq (CO1, LDC, 2), %rax -#if B_PR1 >= 480 +#if B_PR1 > 416 prefetcht0 512 + BUFFER1 #endif From 28e96458e5a4b2d8039ed16048a07892a7c960bf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Jul 2019 08:28:16 +0200 Subject: [PATCH 115/127] Replace vpermpd with vpermilpd to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180) --- kernel/x86_64/zdot_microk_haswell-2.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 9f2fc2c1d..4eade7bfd 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" @@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// 
"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" From 3f6ab1582aca019cf5514aac3af98dcb66c9bbd6 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Mon, 22 Jul 2019 21:24:57 -0600 Subject: [PATCH 116/127] MAINT: remove legacy CMake endif() * clean up a case where CMake endif() contained the conditional used in the if(), which is no longer needed / discouraged since our minimum required CMake version supports the modern syntax --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94d3ba643..610f689e0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX") EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) if(${OPERATING_SYSTEM} MATCHES "Android") set(HOST_OS ANDROID) - endif(${OPERATING_SYSTEM} MATCHES "Android") + endif() endif() From af2e7f28fce42e39fd3d4e108dfb4d55b377b5ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 23 Jul 2019 16:56:40 +0200 Subject: [PATCH 117/127] Override special make variables as seen in https://github.com/xianyi/OpenBLAS/issues/1912#issuecomment-514183900 , any external setting of TARGET_ARCH (which could result from building OpenBLAS as part of a larger project that actually uses this variable) would cause the utest build to fail. (Other subtargets appear to be unaffected as they do not use implicit make rules) --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..5846db0bb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. +override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all From 30efed14d1aa9e1fba887aeddac964b841dd4720 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Jul 2019 15:26:09 +0200 Subject: [PATCH 118/127] Unset special make variables in ctest Makefile as well --- ctest/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 569a5dda3..f562c9bb3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,8 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= LIB = $(TOPDIR)/$(LIBNAME) From 7eecd8e39cfd3bf3f8eddc1154b8b2bfec19ea33 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 28 Jul 2019 07:39:09 +0800 Subject: [PATCH 119/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 334 ++++++++++++++++++++++- 1 file changed, 325 insertions(+), 9 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 082e62a7c..19e32ef2c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -107,6 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PR1 512 #define B_PR1 160 +#define BROADCASTKERNEL /******************************************************************************************* * Macro definitions @@ -133,7 +134,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 prefetcht0 B_PR1(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1+64(BO) vmovups -8 * SIZE(BO), %ymm2 prefetcht0 B_PR1+128(BO) @@ -143,17 +148,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -165,23 +182,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_M1 prefetcht0 A_PR1(AO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -192,21 +224,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -218,21 +266,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -241,23 +305,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_SUB vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -289,27 +369,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + #if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + #if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + #if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + #if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif @@ -338,11 +444,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht1 56(%rax) prefetcht1 56(%rax,LDC) - vpermilpd $ 0x05 , %ymm9 , %ymm9 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 @@ -353,7 +469,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -377,6 +493,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht1 56(%rbp) prefetcht1 56(%rbp,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -392,7 +518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp @@ -693,19 +819,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_I vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -715,19 +857,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
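
NOTE (illustrative sketch, not part of any patch): the SAVE4x12 hunks above
show the other half of the BROADCASTKERNEL trade: the accumulators now hold
rows of each 4x4 sub-tile while C is column-major, so the old vpermilpd and
vblendpd untangling is replaced by a full 4x4 transpose built from
vperm2f128 plus vunpcklpd/vunpckhpd. The same sequence in intrinsics:

#include <immintrin.h>

void transpose_4x4(const __m256d row[4], __m256d col[4]) {
    /* vperm2f128 $0x20 / $0x31: pair matching 128-bit lanes of rows */
    __m256d t0 = _mm256_permute2f128_pd(row[0], row[2], 0x20);
    __m256d t1 = _mm256_permute2f128_pd(row[1], row[3], 0x20);
    __m256d t2 = _mm256_permute2f128_pd(row[0], row[2], 0x31);
    __m256d t3 = _mm256_permute2f128_pd(row[1], row[3], 0x31);
    /* vunpcklpd / vunpckhpd: interleave elements within each lane */
    col[0] = _mm256_unpacklo_pd(t0, t1);
    col[1] = _mm256_unpackhi_pd(t0, t1);
    col[2] = _mm256_unpacklo_pd(t2, t3);
    col[3] = _mm256_unpackhi_pd(t2, t3);
}
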
.macro KERNEL4x8_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -736,18 +893,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -757,18 +930,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -776,19 +965,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x8_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -809,6 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -824,6 +1039,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax @@ -847,6 +1063,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 56(%rax) prefetcht0 56(%rax,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -862,7 +1088,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -1088,15 +1314,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1104,29 +1346,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1134,30 +1407,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1171,6 +1476,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1186,6 +1501,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax From 2dfb804cb943ac12035fe51859d109daca76b4f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Jul 2019 23:17:28 +0200 Subject: [PATCH 120/127] Replace vpermpd with vpermilpd in the Haswell DTRMM kernel to improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186 --- kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c index 651736b89..2acdc4615 100644 --- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -33,7 +33,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" @@ -41,7 +41,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" @@ -62,18 +62,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" - " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t" + " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t" " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" @@ -85,18 +83,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmovups %%ymm6 , (%7) \n\t" " vmovups %%ymm7 , (%8) \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" From 648491e1aa5cec7e8b8947d8ce47a825ceba705d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:51:09 +0200 Subject: [PATCH 121/127] Autodetect 
Intel Ice Lake (as SKYLAKEX target) --- cpuid_x86.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..141d6044e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,20 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { From 3d36c4511693bfd7c117465a701c5ff1f19f8565 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:52:35 +0200 Subject: [PATCH 122/127] Add CPUID identification of Intel Ice Lake --- driver/others/dynamic.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 045fc65b8..f1cd3c6e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){ } } return NULL; + case 7: + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; case 9: case 8: - if (model == 14 ) { // Kaby Lake + if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { From acf6002ab242f98460845bb71db8fefdbdb26a1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Aug 2019 12:40:13 +0200 Subject: [PATCH 123/127] Replace most vpermpd calls in the Haswell DTRSM_RN kernel --- kernel/x86_64/dtrsm_kernel_RN_haswell.c | 36 +++++++++++-------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index 9ab78fc8e..cb939e762 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -132,7 +132,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "1: \n\t" " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd 
$0x05 , %%ymm0 , %%ymm3 \n\t" // was vpermpd 0xb1 " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" @@ -143,7 +143,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -160,7 +160,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -170,7 +170,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" @@ -185,7 +185,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" @@ -193,7 +193,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" @@ -204,7 +204,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -212,42 +212,38 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" "3: \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" 
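
NOTE (illustrative check, not part of any patch): patches 115, 120 and this
one make the same substitution for the same reason. For these shuffles the
two encodings are functionally identical: vpermpd $0xb1 and vpermilpd $0x05
both swap adjacent element pairs. The difference is that vpermilpd stays
inside each 128-bit lane, which is cheaper on CPUs that execute 256-bit ops
as two halves (AMD Zen, per the #2180 discussion cited above). A quick
equivalence test (compile with gcc -mavx2):

#include <immintrin.h>
#include <string.h>
#include <stdio.h>

int main(void) {
    __m256d v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
    __m256d a = _mm256_permute4x64_pd(v, 0xb1); /* vpermpd, cross-lane */
    __m256d b = _mm256_permute_pd(v, 0x05);     /* vpermilpd, in-lane  */
    puts(memcmp(&a, &b, sizeof a) == 0 ? "identical" : "differ");
    return 0;
}
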
" vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" - " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + " vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" + " vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t" " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" From 4e2f81cfa1f6dfa24912c3ff88470471b39b695e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Aug 2019 23:15:35 +0200 Subject: [PATCH 124/127] Provide more information on mmap/munmap failure for #2207 --- driver/others/memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f67cb01f4..77d2b72fa 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ +if (!release->address) return 0; + if (munmap(release -> address, BUFFER_SIZE)) { - printf("OpenBLAS : munmap failed\n"); + int errsv=errno; + perror("OpenBLAS : munmap failed:"); + printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); } } @@ -2073,6 +2077,12 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + } else { +#ifdef DEBUG + int errsv=errno; + perror("OpenBLAS : mmap failed:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); +#endif } #ifdef OS_LINUX From 1776ad82c01c0f9efeeda043eb02e10187084066 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Aug 2019 00:08:11 +0200 Subject: [PATCH 125/127] Add files via upload --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 77d2b72fa..534d6d9fc 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,7 +2041,7 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ -if (!release->address) return 0; +if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; From b7bbb02447ed612e380dc1ca6d6e7a26f48dc868 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 12:46:05 +0200 Subject: [PATCH 126/127] Silence two nuisance warnings from gcc --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a5e731d74..e8aa29813 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -94,7 +94,7 @@ int get_feature(char *search) if( p == NULL ) return 0; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { if (!strcmp(t, search)) { return(1); } } @@ -344,7 +344,7 @@ void get_features(void) if( p == NULL ) return; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { } From be147a9f28889d831019c6f860d501b2546e3771 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 16:24:39 +0200 
Subject: [PATCH 127/127] Avoid adding a spurious dependency on the fortran runtime despite NOFORTRAN=1 for cases where a fortran compiler is present but not wanted (e.g. not fully functional) --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..835c76e78 100644 --- a/Makefile.system +++ b/Makefile.system @@ -267,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv -# For detect fortran failed, only build BLAS. +# When fortran support was either not detected or actively deselected, only build BLAS. ifeq ($(NOFORTRAN), 1) NO_LAPACK = 1 +override FEXTRALIB = endif #
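
NOTE (closing illustration for the Ice Lake detection patches 121/122
above, not part of any patch): "family 6 exmodel 7" with model 14 is the
split CPUID encoding of model 0x7E; for family 6 the effective model is
(extended_model << 4) | model. A minimal decode using GCC's <cpuid.h>:

#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
    unsigned family  = (eax >> 8) & 0xf;
    unsigned model   = (eax >> 4) & 0xf;
    unsigned exmodel = (eax >> 16) & 0xf;
    if (family == 6)
        model |= exmodel << 4;       /* Ice Lake client: 0x7e */
    printf("family %u, exmodel %u, model 0x%x\n", family, exmodel, model);
    return 0;
}
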