From 1a10d3e09ded85be92a7b4860113c864d16e8172 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 27 Oct 2021 16:37:18 +0200 Subject: [PATCH 01/15] add sve dgemm prototype --- kernel/arm64/dgemm_kernel_sve_v1x8.S | 851 +++++++++++++++++++++++++++ 1 file changed, 851 insertions(+) create mode 100644 kernel/arm64/dgemm_kernel_sve_v1x8.S diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..c2bbbee25 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -0,0 +1,851 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define alpha x17 + +#define alpha0 d10 +#define alphaZ z10.d +#define alphaV0 v10.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 pA0_2 +//v03 pA0_3 +//v04 pA0_4 +//v05 pA0_5 +//v06 pA0_6 +//v07 pA0_7 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, x18, lsl #3] // next one + //incb pA, all, mul #2 + add pA, pA, x18, lsl #4 // pA = pA + cnt_active * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, 
[pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + dup alphaZ, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + dup alphaZ, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + 
ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + dup alphaZ, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + +.endm + +.macro SAVEv1x1 + dup alphaZ, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + + add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pCRow0, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L8_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + /* mov counterI, origM */ + /* asr counterI, counterI, #3 // counterI = counterI / 8 */ + /* cmp counterI, #0 */ + /* ble .Ldgemm_kernel_L4_M4_BEGIN */ + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
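// Main-loop structure: KERNELv1x8_I preloads the A vectors (z0/z1) and the B scalars and issues
// the first block of FMAs; KERNELv1x8_M1/M2 then ping-pong between z0 and z1 so the loads for
// the next k step overlap the FMAs of the current one, and KERNELv1x8_E drains the pipeline
// without loading. The prologue (I..M2) and epilogue (M1..E) each cover one unrolled-by-8
// group, which is why at least two groups (K >= 16) are required here; smaller K falls through
// to the single-group path at _32 and the KERNELv1x8_SUB remainder loop at _46.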
+ blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pCRow0, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
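// The column (N) loop steps down through progressively narrower kernels: the L8 block above
// handles 8 columns of B at a time, this L4 block handles a remaining group of 4 with the
// simpler, non-pipelined KERNELv1x4_SUB (issued 8x per pass over K/8), and the L2/L1 blocks
// below repeat the same pattern for 2 and 1 columns.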
+ blt .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #5 // B = B + K * 4 * 8 + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pCRow0, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + blt .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pCRow0, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
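// In every column block the M dimension is walked one SVE vector at a time: whilelt p1.d
// builds the predicate for the current chunk of rows, cntp (kept in x18) counts the active
// lanes and is used to advance pA and pC, and incd/whilelt step to the next chunk, so the
// final partial vector is handled purely by predication with no scalar tail loop.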
+ blt .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 746b4f0f175051b67bfdd5197542871c23790e4b Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 30 Oct 2021 12:11:44 +0200 Subject: [PATCH 02/15] added SVE ncopy and tcopy --- kernel/arm64/dgemm_ncopy_sve_v1.c | 79 +++++++++++++++++++++++++++++++ kernel/arm64/dgemm_tcopy_sve_v1.c | 78 ++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 kernel/arm64/dgemm_ncopy_sve_v1.c create mode 100644 kernel/arm64/dgemm_tcopy_sve_v1.c diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..342812107 --- /dev/null +++ b/kernel/arm64/dgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda); + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..21bfdf3db --- /dev/null +++ b/kernel/arm64/dgemm_tcopy_sve_v1.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda); + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1(pg, (double *)aoffset1); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} From a8fbdbac34f61c06a212876c07e89fb02b1c9dad Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 31 Oct 2021 10:24:25 +0100 Subject: [PATCH 03/15] fix sve dgemm kernel + sve dtrmm --- kernel/arm64/dgemm_kernel_sve_v1x8.S | 140 ++-- kernel/arm64/dtrmm_kernel_sve_v1x8.S | 1007 ++++++++++++++++++++++++++ 2 files changed, 1088 insertions(+), 59 deletions(-) create mode 100644 kernel/arm64/dtrmm_kernel_sve_v1x8.S diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S index c2bbbee25..94682aea9 100644 --- a/kernel/arm64/dgemm_kernel_sve_v1x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pCRow3 x15 + +#define lanes x15 #define pA x16 #define alpha x17 #define alpha0 d10 -#define alphaZ z10.d -#define alphaV0 v10.d[0] +#define alphaZ z2.d #define A_PRE_SIZE 2560 -#define B_PRE_SIZE 448 +#define B_PRE_SIZE 512 #define C_PRE_SIZE 128 // 00 origM @@ -73,9 +73,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pCRow3 +// 15 lanes // 16 pA -// 17 +// 17 // 18 must save // 19 must save // 20 must save @@ -93,20 +93,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
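For reference, the computation one v1x8 tile performs can be sketched in C with the same ACLE
intrinsics the patch-02 copy kernels use. This is an illustrative sketch only, not part of the
patch and not the tuned kernel; the function and variable names are made up.

#include <stdint.h>
#include <arm_sve.h>

/* Illustrative reference (hypothetical, not in the patch): one v1x8 tile computes
   C[0:lanes, 0:8] += alpha * A_panel * B_panel, where the packed A panel holds `lanes`
   doubles per k step and the packed B panel holds 8 doubles per k step.                 */
static void dgemm_v1x8_ref(const double *a, const double *b, double *c,
                           int64_t k, int64_t ldc, int64_t m_left, double alpha)
{
    svbool_t p1 = svwhilelt_b64_s64(0, m_left);            /* active rows (M lanes)   */
    int64_t lanes = (int64_t)svcntp_b64(svptrue_b64(), p1);

    /* eight accumulators, one per column of B (C0..C7 in the register map below) */
    svfloat64_t acc0 = svdup_n_f64(0.0), acc1 = svdup_n_f64(0.0);
    svfloat64_t acc2 = svdup_n_f64(0.0), acc3 = svdup_n_f64(0.0);
    svfloat64_t acc4 = svdup_n_f64(0.0), acc5 = svdup_n_f64(0.0);
    svfloat64_t acc6 = svdup_n_f64(0.0), acc7 = svdup_n_f64(0.0);

    for (int64_t i = 0; i < k; i++) {
        svfloat64_t av = svld1_f64(p1, a);                 /* one vector of A         */
        acc0 = svmla_n_f64_m(p1, acc0, av, b[0]);          /* 8 broadcast FMAs        */
        acc1 = svmla_n_f64_m(p1, acc1, av, b[1]);
        acc2 = svmla_n_f64_m(p1, acc2, av, b[2]);
        acc3 = svmla_n_f64_m(p1, acc3, av, b[3]);
        acc4 = svmla_n_f64_m(p1, acc4, av, b[4]);
        acc5 = svmla_n_f64_m(p1, acc5, av, b[5]);
        acc6 = svmla_n_f64_m(p1, acc6, av, b[6]);
        acc7 = svmla_n_f64_m(p1, acc7, av, b[7]);
        a += lanes;                                        /* packed A: lanes per k   */
        b += 8;                                            /* packed B: 8 per k       */
    }

    /* C(:, j) += alpha * acc_j, the same update the SAVEv1x8 macro performs */
    svst1_f64(p1, c + 0*ldc, svmla_n_f64_m(p1, svld1_f64(p1, c + 0*ldc), acc0, alpha));
    svst1_f64(p1, c + 1*ldc, svmla_n_f64_m(p1, svld1_f64(p1, c + 1*ldc), acc1, alpha));
    svst1_f64(p1, c + 2*ldc, svmla_n_f64_m(p1, svld1_f64(p1, c + 2*ldc), acc2, alpha));
    svst1_f64(p1, c + 3*ldc, svmla_n_f64_m(p1, svld1_f64(p1, c + 3*ldc), acc3, alpha));
    svst1_f64(p1, c + 4*ldc, svmla_n_f64_m(p1, svld1_f64(p1, c + 4*ldc), acc4, alpha));
    svst1_f64(p1, c + 5*ldc, svmla_n_f64_m(p1, svld1_f64(p1, c + 5*ldc), acc5, alpha));
    svst1_f64(p1, c + 6*ldc, svmla_n_f64_m(p1, svld1_f64(p1, c + 6*ldc), acc6, alpha));
    svst1_f64(p1, c + 7*ldc, svmla_n_f64_m(p1, svld1_f64(p1, c + 7*ldc), acc7, alpha));
}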
//v00 ALPHA -> pA0_0 //v01 pA0_1 -//v02 pA0_2 -//v03 pA0_3 -//v04 pA0_4 -//v05 pA0_5 -//v06 pA0_6 -//v07 pA0_7 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 //v08 must save pB0_0 //v09 must save pB0_1 -//v10 must save pB0_2 --> ALPHA0 +//v10 must save pB0_2 //v11 must save pB0_3 -//v12 must save pB1_0 -//v13 must save pB1_1 -//v14 must save pB1_2 -//v15 must save pB1_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 //v16 must save C0 //v17 must save C1 //v18 must save C2 @@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA] - ld1d z1.d, p1/z, [pA, x18, lsl #3] // next one + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one //incb pA, all, mul #2 - add pA, pA, x18, lsl #4 // pA = pA + cnt_active * 2 * 8 + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -157,12 +157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z19.d, p1/m, z0.d, z11.d ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z21.d, p1/m, z0.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z0.d, z14.d ld1rd z14.d, p0/z, [pB, 48] fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 @@ -170,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_M1 ld1d z1.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 fmla z16.d, p1/m, z0.d, z8.d ld1rd z8.d, p0/z, [pB] @@ -181,12 +183,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z19.d, p1/m, z0.d, z11.d ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z21.d, p1/m, z0.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z0.d, z14.d ld1rd z14.d, p0/z, [pB, 48] fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 @@ -194,7 +198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_M2 ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 fmla z16.d, p1/m, z1.d, z8.d ld1rd z8.d, p0/z, [pB] @@ -206,6 +210,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z1.d, z12.d ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z21.d, p1/m, z1.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z1.d, z14.d @@ -222,6 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z18.d, p1/m, z1.d, z10.d fmla z19.d, p1/m, z1.d, z11.d fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z21.d, p1/m, z1.d, z13.d fmla z22.d, p1/m, z1.d, z14.d fmla z23.d, p1/m, z1.d, z15.d @@ -229,7 +235,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_SUB ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -245,16 +251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
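// The prfm hints this patch interleaves with the FMAs keep the packed panels streaming:
// PLDL1KEEP at [pA, #A_PRE_SIZE] / [pB, #B_PRE_SIZE] pulls the A and B panels into L1 well
// ahead of use, while the SAVE macros issue PLDL2KEEP at [pCRow*, #C_PRE_SIZE] for the C rows
// that are about to be read back and updated.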
fmla z16.d, p1/m, z0.d, z8.d fmla z17.d, p1/m, z0.d, z9.d fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla z19.d, p1/m, z0.d, z11.d fmla z20.d, p1/m, z0.d, z12.d fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z22.d, p1/m, z0.d, z14.d fmla z23.d, p1/m, z0.d, z15.d .endm .macro SAVEv1x8 - dup alphaZ, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] @@ -262,43 +269,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z26.d, p1/z, [pCRow2] fmla z26.d, p1/m, z18.d, alphaZ st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z27.d, p1/z, [pCRow1] fmla z27.d, p1/m, z19.d, alphaZ st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z28.d, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaZ st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z29.d, p1/z, [pCRow1] fmla z29.d, p1/m, z21.d, alphaZ st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z30.d, p1/z, [pCRow2] fmla z30.d, p1/m, z22.d, alphaZ st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow2, pCRow1, LDC ld1d z31.d, p1/z, [pCRow1] fmla z31.d, p1/m, z23.d, alphaZ st1d z31.d, p1, [pCRow1] - add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 .endm @@ -313,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x4_SUB ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -324,13 +337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z16.d, p1/m, z0.d, z8.d fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla z18.d, p1/m, z0.d, z10.d fmla z19.d, p1/m, z0.d, z11.d .endm .macro SAVEv1x4 - dup alphaZ, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] @@ -338,23 +351,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z26.d, p1/z, [pCRow2] fmla z26.d, p1/m, z18.d, alphaZ st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow2, pCRow1, LDC ld1d z27.d, p1/z, [pCRow1] fmla z27.d, p1/m, z19.d, alphaZ st1d z27.d, p1, [pCRow1] - add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 .endm @@ -367,7 +382,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x2_SUB ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -375,12 +390,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add pB, pB, 16 fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla z17.d, p1/m, z0.d, z9.d .endm .macro SAVEv1x2 - dup alphaZ, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] @@ -388,13 +403,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow2, pCRow1, LDC ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ st1d z25.d, p1, [pCRow1] - add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 .endm @@ -406,28 +421,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x1_SUB ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 ld1rd z8.d, p0/z, [pB] add pB, pB, 8 fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x1 - dup alphaZ, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - add pCRow1, pCRow0, LDC ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] - add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 .endm @@ -456,6 +470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [origPA] fmov alpha, d0 + dup alphaZ, alpha lsl LDC, LDC, #3 // ldc = ldc * 8 ptrue p0.d // create true predicate @@ -473,7 +488,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldgemm_kernel_L8_BEGIN: mov pCRow0, pC - add pC, pCRow0, LDC, lsl #3 // add 8 x LDC + add pC, pC, LDC, lsl #3 // add 8 x LDC mov pA, origPA // pA = start of A array @@ -481,11 +496,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterI, #0 whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d - /* mov counterI, origM */ - /* asr counterI, counterI, #3 // counterI = counterI / 8 */ - /* cmp counterI, #0 */ - /* ble .Ldgemm_kernel_L4_M4_BEGIN */ + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 .Ldgemm_kernel_L8_Mv1_20: @@ -584,7 +595,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. incd counterI whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension b.any .Ldgemm_kernel_L8_Mv1_20 .Ldgemm_kernel_L8_END: @@ -608,7 +619,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pCRow0, pC - add pC, pCRow0, LDC, lsl #2 // add 4 x LDC + add pC, pC, LDC, lsl #2 // add 4 x LDC mov pA, origPA // pA = start of A array @@ -616,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterI, #0 whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d .align 5 .Ldgemm_kernel_L4_Mv1_20: @@ -626,17 +637,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. asr counterL , origK, #3 // L = K / 8 cmp counterL , #0 // is there at least 4 to do? 
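// counterL here is K/8 and can never be negative, so the original blt never skipped the
// 8x-unrolled loop; with K < 8 it would run eight extra KERNELv1x4_SUB steps. Changing it to
// ble (below, and likewise in the L2/L1 blocks) skips straight to the _44 remainder handling
// when there is no full group of 8.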
- blt .Ldgemm_kernel_L4_Mv1_44 + ble .Ldgemm_kernel_L4_Mv1_44 .align 5 .Ldgemm_kernel_L4_Mv1_22: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB @@ -651,6 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 .Ldgemm_kernel_L4_Mv1_46: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB subs counterL, counterL, #1 @@ -667,12 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. incd counterI whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d b.any .Ldgemm_kernel_L4_Mv1_20 .Ldgemm_kernel_L4_END: - add origPB, origPB, origK, lsl #5 // B = B + K * 4 * 8 + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ /******************************************************************************/ @@ -686,7 +703,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pCRow0, pC - add pC, pCRow0, LDC, lsl #1 // add 2 x LDC + add pC, pC, LDC, lsl #1 // add 2 x LDC mov pA, origPA // pA = start of A array @@ -694,7 +711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterI, #0 whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d .align 5 .Ldgemm_kernel_L2_Mv1_20: @@ -704,15 +721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. asr counterL , origK, #3 // L = K / 8 cmp counterL , #0 // is there at least 4 to do? - blt .Ldgemm_kernel_L2_Mv1_44 + ble .Ldgemm_kernel_L2_Mv1_44 .align 5 .Ldgemm_kernel_L2_Mv1_22: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB @@ -729,6 +748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 .Ldgemm_kernel_L2_Mv1_46: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB subs counterL, counterL, #1 @@ -745,7 +765,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. incd counterI whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d b.any .Ldgemm_kernel_L2_Mv1_20 @@ -764,7 +784,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pCRow0, pC - add pC, pCRow0, LDC, lsl #1 // add 2 x LDC + add pC, pC, LDC // add 1 x LDC mov pA, origPA // pA = start of A array @@ -772,7 +792,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterI, #0 whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d .align 5 .Ldgemm_kernel_L1_Mv1_20: @@ -781,12 +801,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. INITv1x1 // fill with zeros asr counterL , origK, #3 // L = K / 8 - cmp counterL , #0 // is there at least 4 to do? - blt .Ldgemm_kernel_L1_Mv1_44 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 .align 5 .Ldgemm_kernel_L1_Mv1_22: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x1_SUB KERNELv1x1_SUB KERNELv1x1_SUB @@ -807,10 +828,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 5 .Ldgemm_kernel_L1_Mv1_46: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x1_SUB subs counterL, counterL, #1 - bne .Ldgemm_kernel_L1_Mv1_46 + bgt .Ldgemm_kernel_L1_Mv1_46 .Ldgemm_kernel_L1_Mv1_100: prfm PLDL1KEEP, [pA] @@ -823,7 +845,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. incd counterI whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d b.any .Ldgemm_kernel_L1_Mv1_20 diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..458090411 --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -0,0 +1,1007 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha x17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 d10 +#define alphaZ z2.d + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one + //incb pA, all, mul #2 + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, 
[pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.d, p1/m, z20.d, alphaZ + st1d z20.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.d, p1/m, z21.d, alphaZ + st1d z21.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.d, p1/m, z22.d, alphaZ + st1d z22.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.d, p1/m, z23.d, alphaZ + st1d z23.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro 
SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmla z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmla z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmla z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmla z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmla z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmla z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmla z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldtrmm_kernel_L4_BEGIN + +/******************************************************************************/ + + .align 5 +.Ldtrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L8_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldtrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldtrmm_kernel_L8_Mv1_22a + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L8_Mv1_22 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldtrmm_kernel_L8_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldtrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldtrmm_kernel_L8_Mv1_44 + +.Ldtrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldtrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L8_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L8_Mv1_46 + +.Ldtrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L8_Mv1_20 + +.Ldtrmm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Ldtrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldtrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldtrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is 
there at least 4 to do? + ble .Ldtrmm_kernel_L4_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L4_Mv1_22 + +.Ldtrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L4_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L4_Mv1_46 + +.Ldtrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L4_Mv1_20 + + +.Ldtrmm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldtrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldtrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
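The same LEFT/TRANSA offset bookkeeping recurs in each of the L8, L4, L2 and L1 blocks above. Written out in C it amounts to the sketch below, where n_unroll is 8, 4, 2 or 1 for the respective block and lanes is the number of active SVE doubles in the current M panel; this is an illustration of the preprocessor cases, not code taken from the patch.

    /* Per-panel TRMM setup as performed by the #if LEFT/TRANSA blocks above. */
    static void trmm_panel_setup(long K, long tempOffset, long lanes, long n_unroll,
                                 double **pA, double **pB, long *tempK)
    {
    #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        /* start at the beginning of both packed panels */
    #else
        *pA += tempOffset * lanes;      /* add pA, pA, temp, lsl #3          */
        *pB += tempOffset * n_unroll;   /* lsl temp, tempOffset, #6/#5/#4/#3 */
    #endif
    #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        *tempK = K - tempOffset;        /* only the trailing K iterations matter */
    #elif defined(LEFT)
        *tempK = tempOffset + lanes;
    #else
        *tempK = tempOffset + n_unroll;
    #endif
    }
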
+ ble .Ldtrmm_kernel_L2_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L2_Mv1_22 + +.Ldtrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L2_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L2_Mv1_46 + +.Ldtrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Ldtrmm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L2_Mv1_20 + + +.Ldtrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldtrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldtrmm_kernel_L1_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_22 + +.Ldtrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L1_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_46 + +.Ldtrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Ldtrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L1_Mv1_20 + + +.Ldtrmm_kernel_L1_END: + +/******************************************************************************/ + +.Ldtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 7093372e3283e221e6598bc7ed93abf5f8e8a523 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 1 Nov 2021 22:53:21 +0100 Subject: [PATCH 04/15] add ARMV8SVE target --- Makefile.arm64 | 7 ++ getarch.c | 14 +++ kernel/arm64/KERNEL.ARMV8SVE | 183 ++++++++++++++++++++++++++++++ kernel/arm64/dgemm_tcopy_sve_v1.c | 1 - param.h | 30 +++++ 5 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 kernel/arm64/KERNEL.ARMV8SVE diff --git a/Makefile.arm64 b/Makefile.arm64 index a07d0892b..801601030 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -20,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a endif endif +ifeq ($(CORE), ARMV8SVE) +CCOMMON_OPT += -march=armv8-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a+sve +endif +endif + ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 ifneq ($(F_COMPILER), NAG) diff --git a/getarch.c b/getarch.c index 60bfe05ce..7ae7591c5 100644 --- a/getarch.c +++ b/getarch.c @@ -1198,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
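The new ARMV8SVE target builds the whole library with -march=armv8-a+sve and, through the KERNEL.ARMV8SVE file added below, routes DGEMM to the vector-length-agnostic SVE kernels. Those kernels never hard-code a vector width; they query it at run time with svcntd(), roughly as in this standalone example (illustration only, assuming an SVE-capable toolchain):

    #include <arm_sve.h>
    #include <stdio.h>

    int main(void) {
        /* svcntd() = 64-bit lanes per SVE vector: 2 on a 128-bit part,
           8 on a 512-bit implementation such as the A64FX. */
        long lanes = (long)svcntd();
        printf("SVE holds %ld doubles per vector; the v1x8 kernel works on a %ldx8 C tile\n",
               lanes, lanes);
        return 0;
    }
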
#else #endif +#ifdef FORCE_ARMV8SVE +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8SVE" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV8SVE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "armv8sve" +#define CORENAME "ARMV8SVE" +#endif + #ifdef FORCE_ARMV8 #define FORCE diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE new file mode 100644 index 000000000..572c96fac --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -0,0 +1,183 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = 
../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c index 21bfdf3db..33e69bf0c 100644 --- a/kernel/arm64/dgemm_tcopy_sve_v1.c +++ b/kernel/arm64/dgemm_tcopy_sve_v1.c @@ -46,7 +46,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG j; IFLOAT *aoffset, *aoffset1, *boffset; - svint64_t lda_vec = svindex_s64(0LL, lda); uint64_t sve_size = svcntd(); aoffset = a; diff --git a/param.h b/param.h index 23f406d74..8c2061931 100644 --- a/param.h +++ b/param.h @@ -3294,6 +3294,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 
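For context, the P, Q and R values in this block are OpenBLAS's cache-blocking sizes for the packed GEMM panels, roughly the block dimensions in the M, K and N directions. With DGEMM_DEFAULT_P = 160 above and DGEMM_DEFAULT_Q = 128 just below, a packed block of A takes about 160 x 128 x 8 B = 160 KiB, small enough to stay resident in a mid-level cache while the micro-kernel streams the B panel past it (an interpretation of the chosen numbers, not a statement from the patch).
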
+ +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16 @@ -3325,6 +3354,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #endif /* Cores */ + #endif /* ARMv8 */ #if defined(ARMV5) From ab7917910d05c9d55f7511e440c0b0e4178f4511 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 7 Nov 2021 20:37:51 +0100 Subject: [PATCH 05/15] add v2x8 kernel + fix sve dtrmm --- kernel/arm64/KERNEL.A64FX | 28 +- kernel/arm64/dgemm_kernel_sve_v2x8.S | 1665 ++++++++++++++++++++++++++ kernel/arm64/dtrmm_kernel_sve_v1x8.S | 14 +- param.h | 4 +- 4 files changed, 1682 insertions(+), 29 deletions(-) create mode 100644 kernel/arm64/dgemm_kernel_sve_v2x8.S diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index c8a53c86b..4c2921e03 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -143,34 +143,22 @@ endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + + CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..59e41559f --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -0,0 +1,1665 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha x18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 d10 +#define alphaZ z7.d + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 +//v03 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 + dup z24.d, #0 + dup z25.d, #0 + dup z26.d, #0 + dup z27.d, #0 + dup z28.d, #0 + dup z29.d, #0 + dup z30.d, #0 + dup z31.d, #0 +.endm + +.macro KERNELv2x8_I + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + ld1d z2.d, p0/z, [pA1, vec_len, lsl #3] + ld1d z3.d, p0/z, [pA2, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + add pA2, pA2, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + 
ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M1 + ld1d z2.d, p0/z, [pA1] + ld1d z3.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M2 + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_E + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.d, p0/m, z2.d, 
z13.d + fmla z27.d, p0/m, z3.d, z13.d + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d +.endm + +.macro KERNELv2x8_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + fmla z24.d, p0/m, z0.d, z12.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.d, p0/m, z1.d, z12.d + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z8.d, p0/z, [pCRow2] + ld1d z9.d, p0/z, [pCRow2, #1, mul vl] + fmla z8.d, p0/m, z24.d, alphaZ + fmla z9.d, p0/m, z25.d, alphaZ + st1d z8.d, p0, [pCRow2] + st1d z9.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z26.d, alphaZ + fmla z11.d, p0/m, z27.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z28.d, alphaZ + fmla z13.d, p0/m, z29.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z30.d, alphaZ + fmla z15.d, p0/m, z31.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + 
dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv2x4_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d +.endm + +.macro SAVEv2x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv2x2_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 +.endm + +.macro INITv2x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv2x1_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm 
PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA1] + ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one + //incb pA1, all, mul #2 + add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + 
+ fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, 
[pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + cntd vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Ldgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
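For orientation, each pass through the v2x8 loop that follows accumulates a (2*VL) x 8 tile of C from the two packed A panels (pA1, pA2) and the packed B panel, and SAVEv2x8 then applies alpha and adds the result into C. A scalar reference of the computation, written as a sketch rather than code from the patch:

    /* Scalar reference for one v2x8 tile; v = svcntd() doubles per vector.
       A1 and A2 are the two packed A panels walked by KERNELv2x8_*, B is the
       packed 8-wide B panel, C is column-major with leading dimension ldc. */
    static void ref_dgemm_v2x8(long v, long K, const double *A1, const double *A2,
                               const double *B, double *C, long ldc, double alpha)
    {
        for (long j = 0; j < 8; j++) {
            for (long i = 0; i < v; i++) {
                double acc1 = 0.0, acc2 = 0.0;          /* z16..z31 accumulators */
                for (long k = 0; k < K; k++) {
                    acc1 += A1[k * v + i] * B[k * 8 + j];
                    acc2 += A2[k * v + i] * B[k * 8 + j];
                }
                C[j * ldc + i]     += alpha * acc1;     /* SAVEv2x8: ld1d + fmla + st1d */
                C[j * ldc + v + i] += alpha * acc2;
            }
        }
    }
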
+ blt .Ldgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv2_22a + + .align 5 +.Ldgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv2_22 + + .align 5 +.Ldgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Ldgemm_kernel_L8_Mv2_44 + + .align 5 +.Ldgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Ldgemm_kernel_L8_Mv2_44 + +.Ldgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Ldgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv2_100 + + .align 5 +.Ldgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv2_46 + +.Ldgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L8_END + +////////////////////////////////// +.Ldgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
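The Mv1 path entered above handles the rows left over once fewer than 2*VL remain: whilelt builds a partially-true predicate for the tail and cntp counts its active lanes, so the same code covers any remainder without a scalar clean-up loop. A minimal ACLE illustration of that idiom (assumed standalone example, not taken from the patch):

    #include <arm_sve.h>

    /* One predicated step over rows i .. M-1 (at most one vector's worth),
       mirroring the whilelt/cntp setup of the Mv1 blocks. */
    void tail_step(long i, long M, const double *a, double b_k, double *c)
    {
        svbool_t p1 = svwhilelt_b64(i, M);                  /* lanes i..M-1, capped at VL */
        long lanes  = (long)svcntp_b64(svptrue_b64(), p1);  /* used to advance pA1/pCRow0 */

        svfloat64_t va = svld1_f64(p1, a);                  /* ld1d z0.d, p1/z, [pA1] */
        svfloat64_t vc = svld1_f64(p1, c);
        vc = svmla_f64_m(p1, vc, va, svdup_f64(b_k));       /* fmla: c += a * b[k] */
        svst1_f64(p1, c, vc);
        (void)lanes;
    }
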
+ blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L4_Mv2_44 + + .align 5 +.Ldgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv2_22 + +.Ldgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv2_100 + + .align 5 +.Ldgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv2_46 + +.Ldgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L4_END + +////////////////////////////////// +.Ldgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L2_Mv2_44 + + .align 5 +.Ldgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv2_22 + +.Ldgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv2_100 + + .align 5 +.Ldgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv2_46 + +.Ldgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L2_END + + +////////////////////////////////// +.Ldgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + + .align 5 +.Ldgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldgemm_kernel_L1_Mv2_44 + + .align 5 +.Ldgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_22 + +.Ldgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv2_100 + + .align 5 +.Ldgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_46 + +.Ldgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L1_END + + +////////////////////////////////// +.Ldgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S index 458090411..1d4df08fb 100644 --- a/kernel/arm64/dtrmm_kernel_sve_v1x8.S +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -344,21 +344,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
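The dtrmm hunk below is the actual fix named in the commit subject: the SAVE macros had been scaling the accumulators with fmla, but fmla with the accumulator as both destination and multiplicand computes z16 <- z16 + z16*alpha = (1 + alpha)*z16. TRMM overwrites C instead of accumulating into it, so the value stored must be alpha*acc, which is exactly what the replacement fmul produces.
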
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - fmla z16.d, p1/m, z16.d, alphaZ + fmul z16.d, p1/m, z16.d, alphaZ st1d z16.d, p1, [pCRow0] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - fmla z17.d, p1/m, z17.d, alphaZ + fmul z17.d, p1/m, z17.d, alphaZ st1d z17.d, p1, [pCRow1] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - fmla z18.d, p1/m, z18.d, alphaZ + fmul z18.d, p1/m, z18.d, alphaZ st1d z18.d, p1, [pCRow2] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - fmla z19.d, p1/m, z19.d, alphaZ + fmul z19.d, p1/m, z19.d, alphaZ st1d z19.d, p1, [pCRow1] add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 @@ -392,11 +392,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - fmla z16.d, p1/m, z16.d, alphaZ + fmul z16.d, p1/m, z16.d, alphaZ st1d z16.d, p1, [pCRow0] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - fmla z17.d, p1/m, z17.d, alphaZ + fmul z17.d, p1/m, z17.d, alphaZ st1d z17.d, p1, [pCRow1] add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 @@ -426,7 +426,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - fmla z16.d, p1/m, z16.d, alphaZ + fmul z16.d, p1/m, z16.d, alphaZ st1d z16.d, p1, [pCRow0] diff --git a/param.h b/param.h index 8c2061931..ad0cecda7 100644 --- a/param.h +++ b/param.h @@ -3328,8 +3328,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 From 7d996b1c365f43fe37fd2127d95c2a82d76f3e2e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 13 Nov 2021 18:48:53 +0100 Subject: [PATCH 06/15] dtrmm_utcopy sve function --- kernel/arm64/trmm_utcopy_sve_v1.c | 128 ++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 kernel/arm64/trmm_utcopy_sve_v1.c diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c new file mode 100644 index 000000000..e44e67373 --- /dev/null +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -0,0 +1,128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, j; + BLASLONG X; + + int sve_len = svcntd(); + + FLOAT *ao; + js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + (posY + j) * lda; + } else { + ao = a + posY + (posX + j) * lda; + } + + i = 0; + /* svbool_t pm = svwhilelt_b64(i, m); */ + /* int m_active = svcntp_b64(svptrue_b64(), pm); */ + do + { + if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { + svfloat64_t aj_vec = svld1(pn, ao); + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + //printf("\n"); + + + posY += n_active; + js += n_active; + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); + + return 0; +} From 3c7eed0e53c4ed8bd5169946fbd06854e193a2b2 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 14 Nov 2021 16:00:10 +0100 Subject: [PATCH 07/15] add remaining trmm copy routines for SVE --- kernel/arm64/trmm_lncopy_sve_v1.c | 127 +++++++++++++++++++++++++++++ kernel/arm64/trmm_ltcopy_sve_v1.c | 128 +++++++++++++++++++++++++++++ kernel/arm64/trmm_uncopy_sve_v1.c | 130 ++++++++++++++++++++++++++++++ kernel/arm64/trmm_utcopy_sve_v1.c | 10 +-- 4 files changed, 389 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/trmm_lncopy_sve_v1.c create mode 100644 kernel/arm64/trmm_ltcopy_sve_v1.c create mode 100644 kernel/arm64/trmm_uncopy_sve_v1.c diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c new file mode 100644 index 000000000..e454e28d4 --- /dev/null +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -0,0 +1,127 @@ +/*********************************************************************/ +/* Copyright 2009,
2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + //printf("Using trmm_ln.\n"); + + int sve_len = svcntd(); + svint64_t index = svindex_s64(0LL, lda); + + FLOAT *ao; + js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + /* svbool_t pm = svwhilelt_b64(i, m); */ + /* int m_active = svcntp_b64(svptrue_b64(), pm); */ + do + { + if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + //printf("\n"); + + + posY += n_active; + js += n_active; + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); + + return 0; +} diff --git 
a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..86433f230 --- /dev/null +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -0,0 +1,128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + //printf("Using trmm_lt.\n"); + + int sve_len = svcntd(); + + FLOAT *ao; + js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + /* svbool_t pm = svwhilelt_b64(i, m); */ + /* int m_active = svcntp_b64(svptrue_b64(), pm); */ + do + { + if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { + svfloat64_t aj_vec = svld1(pn, ao); + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + //printf("\n"); + + + posY += n_active; + js += n_active; + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); + + + return 0; +} diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c new file mode 100644 index 000000000..21f392b62 --- /dev/null +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + //printf("Using trmm_un.\n"); + //printf("Using m %ld, n %ld.\n", m, n); + //printf("Using lda %ld.\n", lda); + //printf("Using posX %ld, posY %ld.\n", posX, posY); + + int sve_len = svcntd(); + svint64_t index = svindex_s64(0LL, lda); + + FLOAT *ao; + js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + posY * lda; + } else { + ao = a + posY + posX * lda; + } + + i = 0; + /* svbool_t pm = svwhilelt_b64(i, m); */ + /* int m_active = svcntp_b64(svptrue_b64(), pm); */ + do + { + if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + //printf("\n"); + + + posY += n_active; + js += n_active; + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); + + return 0; +} diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index e44e67373..38b88dc8c 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -43,13 +43,11 @@ #include <arm_sve.h> #endif -#define MIN(a,b) (((a)<(b))?(a):(b)) -#define MAX(a,b) (((a)>(b))?(a):(b)) - int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, j; + BLASLONG i, js; BLASLONG X; + //printf("Using trmm_ut.\n"); int sve_len = svcntd(); @@ -62,9 +60,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X = posX; if (posX <= posY) { - ao = a + posX + (posY + j) * lda; + ao = a + posX + posY * lda; } else { - ao = a + posY + (posX + j) * lda; + ao = a + posY + posX * lda; } i = 0; From e6ed4be02e9716924e544dd77c2d160ce5c38a05 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 20 Nov 2021 16:35:29 +0100 Subject: [PATCH 08/15] symm SVE copy routines --- kernel/arm64/symm_lcopy_sve.c | 96 +++++++++++++++++++++++++++ kernel/arm64/symm_ucopy_sve.c | 96 +++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 kernel/arm64/symm_lcopy_sve.c create mode 100644 kernel/arm64/symm_ucopy_sve.c diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c new file mode 100644 index 000000000..c3f7ea6b5 --- /dev/null +++ b/kernel/arm64/symm_lcopy_sve.c
@@ -0,0 +1,96 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + + uint64_t sve_size = svcntd(); + svfloat64_t ao_vec; + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c new file mode 100644 index 000000000..3de416cf5 --- /dev/null +++ b/kernel/arm64/symm_ucopy_sve.c @@ -0,0 +1,96 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + + uint64_t sve_size = svcntd(); + svfloat64_t ao_vec; + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint64_t gat_ind = svsel(cmp, temp2, temp1); + + i = m; + while (i>0) { + svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, one_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} From b58d4f31abf55446d4707036df0a0c5c7ef26047 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 21 Nov 2021 14:56:27 +0100 Subject: [PATCH 09/15] some clean-up & commentary --- kernel/arm64/KERNEL.ARMV8SVE | 2 +- kernel/arm64/dgemm_kernel_sve_v1x8.S | 15 +++++---- kernel/arm64/dgemm_kernel_sve_v2x8.S | 38 +++++++++++++++------ kernel/arm64/dgemm_ncopy_sve_v1.c | 50 ++++++++++++++-------------- kernel/arm64/dgemm_tcopy_sve_v1.c | 48 +++++++++++++------------- kernel/arm64/dtrmm_kernel_sve_v1x8.S | 17 +++++----- kernel/arm64/trmm_lncopy_sve_v1.c | 9 ++--- kernel/arm64/trmm_ltcopy_sve_v1.c | 9 ++--- kernel/arm64/trmm_uncopy_sve_v1.c | 12 ++----- kernel/arm64/trmm_utcopy_sve_v1.c | 11 ++---- 10 files changed, 104 insertions(+), 107 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 572c96fac..dbf11fdca 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -143,7 +143,7 @@ endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S DGEMMINCOPY = dgemm_ncopy_sve_v1.c diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S index 94682aea9..bbbd0fd95 100644 --- a/kernel/arm64/dgemm_kernel_sve_v1x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define alpha0 d10 #define alphaZ z2.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA] ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one - //incb pA, all, mul #2 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ptrue p0.d // create true predicate mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldgemm_kernel_L8_BEGIN: @@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldgemm_kernel_L8_Mv1_BEGIN: +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ mov counterI, #0 - whilelt p1.d, counterI, origM //SVE instruction + whilelt p1.d, counterI, origM cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 @@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bgt .Ldgemm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldgemm_kernel_L4_BEGIN: @@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldgemm_kernel_L2_BEGIN: @@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldgemm_kernel_L1_BEGIN: diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S index 59e41559f..023d5ba92 100644 --- a/kernel/arm64/dgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ +/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. +With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ + #define ASSEMBLER #include "common.h" @@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaZ z7.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//v00 ALPHA -> pA10_0 //v01 pA10_1 -//v02 -//v03 +//v02 pA20_0 +//v03 pA20_1 //v04 //v05 //v06 @@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v21 must save C5 //v22 must save C6 //v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 /******************************************************************************* * Macro definitions @@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA1] ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one - //incb pA1, all, mul #2 add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ptrue p0.d // create true predicate mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldgemm_kernel_L8_BEGIN: @@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldgemm_kernel_L8_Mv2_BEGIN: mov counterI, #0 - cmp origM, vec_lenx2 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN blt .Ldgemm_kernel_L8_Mv1_BEGIN mov counterI, origM +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ mul temp, vec_len, origK // generate address of pA2 add pA2, pA1, temp, lsl #3 // pA1 = start of A array prfm PLDL1KEEP, [pA2] @@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp counterI, origM beq .Ldgemm_kernel_L8_END -////////////////////////////////// +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. .Ldgemm_kernel_L8_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bgt .Ldgemm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldgemm_kernel_L4_BEGIN: @@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq .Ldgemm_kernel_L4_END ////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. .Ldgemm_kernel_L4_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldgemm_kernel_L2_BEGIN: @@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. .Ldgemm_kernel_L2_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldgemm_kernel_L1_BEGIN: @@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. .Ldgemm_kernel_L1_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c index 342812107..1f812c775 100644 --- a/kernel/arm64/dgemm_ncopy_sve_v1.c +++ b/kernel/arm64/dgemm_ncopy_sve_v1.c @@ -40,40 +40,40 @@ #include "common.h" #include -// TODO: write in assembly with proper unrolling +// TODO: write in assembly with proper unrolling of inner loop int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; - svint64_t lda_vec = svindex_s64(0LL, lda); - uint64_t sve_size = svcntd(); + svint64_t lda_vec = svindex_s64(0LL, lda); + uint64_t sve_size = svcntd(); - aoffset = a; - boffset = b; + aoffset = a; + boffset = b; - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { - aoffset1 = aoffset; + aoffset1 = aoffset; - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1++; - boffset += active; - } - aoffset += sve_size * lda; + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); - } while (svptest_any(svptrue_b64(), pg)); + } while (svptest_any(svptrue_b64(), pg)); - return 0; + return 0; } diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c index 33e69bf0c..cb645a1b6 100644 --- a/kernel/arm64/dgemm_tcopy_sve_v1.c +++ b/kernel/arm64/dgemm_tcopy_sve_v1.c @@ -40,38 +40,38 @@ #include "common.h" #include -// TODO: write in assembly with proper unrolling +// TODO: write in assembly with proper unrolling of inner loop int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; - uint64_t sve_size = svcntd(); + uint64_t sve_size = svcntd(); - aoffset = a; - boffset = b; + aoffset = a; + boffset = b; - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { - aoffset1 = aoffset; + aoffset1 = aoffset; - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1(pg, (double *)aoffset1); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1 += lda; - boffset += active; - } - aoffset += sve_size; + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1(pg, (double *)aoffset1); + svst1_f64(pg, 
(double *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); - } while (svptest_any(svptrue_b64(), pg)); + } while (svptest_any(svptrue_b64(), pg)); - return 0; + return 0; } diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S index 1d4df08fb..1f8c9b20f 100644 --- a/kernel/arm64/dtrmm_kernel_sve_v1x8.S +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaZ z2.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA] ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one - //incb pA, all, mul #2 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldtrmm_kernel_L8_BEGIN: @@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldtrmm_kernel_L8_Mv1_BEGIN: +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ mov counterI, #0 - whilelt p1.d, counterI, origM //SVE instruction - cntp lanes, p0, p1.d + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 .Ldtrmm_kernel_L8_Mv1_20: @@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bgt .Ldtrmm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldtrmm_kernel_L4_BEGIN: @@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldtrmm_kernel_L2_BEGIN: @@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldtrmm_kernel_L1_BEGIN: diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index e454e28d4..6c38cb3eb 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_ln.\n"); int sve_len = svcntd(); svint64_t index = svindex_s64(0LL, lda); @@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X > posY) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); ao ++; @@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index 86433f230..365be06c3 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_lt.\n"); - int sve_len = svcntd(); FLOAT *ao; @@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X > posY) { ao ++; b += n_active; X ++; @@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - posY += n_active; js += n_active; diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index 21f392b62..502b79928 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_un.\n"); - //printf("Using m %ld, n %ld.\n", m, n); - //printf("Using lda %ld.\n", lda); - //printf("Using posX %ld, posY %ld.\n", posX, posY); int sve_len = svcntd(); svint64_t index = svindex_s64(0LL, lda); @@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X < posY) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); ao ++; @@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 38b88dc8c..b45cbd7da 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_ut.\n"); int sve_len = svcntd(); @@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X < posY) { ao ++; b += n_active; X ++; @@ -83,7 +80,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += n_active; X ++; i ++; - } else { + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); From 9388f05a3cab3b8850bb47c80ab8d10c1017692c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 21 Nov 2021 18:33:43 +0100 Subject: [PATCH 10/15] configure SVE Makefile --- getarch.c | 4 ++-- kernel/Makefile.L3 | 42 ++++++++++++++++++++++++++++++++++++++++++ param.h | 6 ++++-- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/getarch.c b/getarch.c index 7ae7591c5..fa1fb582e 100644 --- a/getarch.c +++ b/getarch.c @@ -1207,7 +1207,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" #define LIBNAME "armv8sve" #define CORENAME "ARMV8SVE" #endif @@ -1450,7 +1450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" #define LIBNAME "a64fx" #define CORENAME "A64FX" #else diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 89691ef6f..05d91cded 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -77,6 +77,14 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif +ifeq ($(CORE), A64FX) +HAVE_SVE = 1 +endif + +ifeq ($(CORE), ARMV8SVE) +HAVE_SVE = 1 +endif + ifdef USE_DIRECT_SGEMM ifndef SGEMMDIRECTKERNEL SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c @@ -1531,6 +1539,31 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef HAVE_SVE +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_uncopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_uncopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_lncopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_lncopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_utcopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_utcopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_ltcopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_ltcopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ @@ -1554,6 +1587,7 @@ $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1789,11 +1823,19 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) 
: generic/symm_ucopy_$(DGEMM_UNROLL_N). $(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef HAVE_SVE +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : arm64/symm_ucopy_sve.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : arm64/symm_lcopy_sve.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ diff --git a/param.h b/param.h index ad0cecda7..bbc52fac4 100644 --- a/param.h +++ b/param.h @@ -3294,12 +3294,14 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) +#elif defined(ARMV8SVE) || defined(A64FX) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +/* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". +Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 From 9b9cb90bb138208502ba913c22b11a5fb3516156 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 22 Nov 2021 09:54:20 +0100 Subject: [PATCH 11/15] modify Makefile for SVE copy --- kernel/Makefile.L3 | 68 ++++++++++++++++++------------------ kernel/arm64/KERNEL.A64FX | 6 ++++ kernel/arm64/KERNEL.ARMV8SVE | 8 +++++ 3 files changed, 48 insertions(+), 34 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 05d91cded..695f8ae70 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -77,14 +77,6 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif -ifeq ($(CORE), A64FX) -HAVE_SVE = 1 -endif - -ifeq ($(CORE), ARMV8SVE) -HAVE_SVE = 1 -endif - ifdef USE_DIRECT_SGEMM ifndef SGEMMDIRECTKERNEL SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c @@ -1539,49 +1531,55 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ -ifdef HAVE_SVE -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_uncopy_sve_v1.c +ifdef DTRMMUNCOPY_M +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_uncopy_sve_v1.c +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_lncopy_sve_v1.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - 
-$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_lncopy_sve_v1.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_utcopy_sve_v1.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_utcopy_sve_v1.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_ltcopy_sve_v1.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_ltcopy_sve_v1.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ else $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLNCOPY_M +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef DTRMMUTCOPY_M +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLTCOPY_M +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ @@ -1823,16 +1821,18 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N). 
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ -ifdef HAVE_SVE -$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : arm64/symm_ucopy_sve.c +ifdef DSYMMUCOPY_M +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : arm64/symm_lcopy_sve.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ else $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef DSYMMLCOPY_M +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ endif diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 4c2921e03..83536f12d 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -157,7 +157,13 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index dbf11fdca..1f605d10b 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -156,6 +156,14 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) From 531a28b6a0fdb908a82d5e1e6404146282b5af5a Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 22 Nov 2021 10:12:34 +0100 Subject: [PATCH 12/15] removed unused code (compiler warnings) --- kernel/arm64/symm_lcopy_sve.c | 5 +---- kernel/arm64/symm_ucopy_sve.c | 5 +---- kernel/arm64/trmm_lncopy_sve_v1.c | 1 - kernel/arm64/trmm_ltcopy_sve_v1.c | 2 -- kernel/arm64/trmm_uncopy_sve_v1.c | 1 - kernel/arm64/trmm_utcopy_sve_v1.c | 2 -- 6 files changed, 2 insertions(+), 14 deletions(-) diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c index c3f7ea6b5..94a68ad7c 100644 --- a/kernel/arm64/symm_lcopy_sve.c +++ b/kernel/arm64/symm_lcopy_sve.c @@ -42,12 +42,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, offset; - - FLOAT data01, data02, data03, data04; + BLASLONG i, offset; uint64_t sve_size = svcntd(); - svfloat64_t ao_vec; svint64_t posY_vec = svdup_s64(posY); svint64_t posX_vec = 
   svint64_t lda_vec = svdup_s64(lda);
diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c
index 3de416cf5..3cf18e0fd 100644
--- a/kernel/arm64/symm_ucopy_sve.c
+++ b/kernel/arm64/symm_ucopy_sve.c
@@ -42,12 +42,9 @@
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
 
-  BLASLONG i, js, offset;
-
-  FLOAT data01, data02, data03, data04;
+  BLASLONG i, offset;
 
   uint64_t sve_size = svcntd();
-  svfloat64_t ao_vec;
   svint64_t posY_vec = svdup_s64(posY);
   svint64_t posX_vec = svdup_s64(posX);
   svint64_t lda_vec = svdup_s64(lda);
diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c
index 6c38cb3eb..fc1b61325 100644
--- a/kernel/arm64/trmm_lncopy_sve_v1.c
+++ b/kernel/arm64/trmm_lncopy_sve_v1.c
@@ -48,7 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   BLASLONG i, js;
   BLASLONG X;
 
-  int sve_len = svcntd();
   svint64_t index = svindex_s64(0LL, lda);
 
   FLOAT *ao;
diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c
index 365be06c3..14c6762d2 100644
--- a/kernel/arm64/trmm_ltcopy_sve_v1.c
+++ b/kernel/arm64/trmm_ltcopy_sve_v1.c
@@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   BLASLONG i, js;
   BLASLONG X;
 
-  int sve_len = svcntd();
-
   FLOAT *ao;
   js = 0;
   svbool_t pn = svwhilelt_b64(js, n);
diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c
index 502b79928..b8344d474 100644
--- a/kernel/arm64/trmm_uncopy_sve_v1.c
+++ b/kernel/arm64/trmm_uncopy_sve_v1.c
@@ -48,7 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   BLASLONG i, js;
   BLASLONG X;
 
-  int sve_len = svcntd();
   svint64_t index = svindex_s64(0LL, lda);
 
   FLOAT *ao;
diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c
index b45cbd7da..9be1c0abb 100644
--- a/kernel/arm64/trmm_utcopy_sve_v1.c
+++ b/kernel/arm64/trmm_utcopy_sve_v1.c
@@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   BLASLONG i, js;
   BLASLONG X;
 
-  int sve_len = svcntd();
-
   FLOAT *ao;
   js = 0;
   svbool_t pn = svwhilelt_b64(js, n);

From f4da23dcb6ac0de6a4c5fc07c704fb0b61ff5b25 Mon Sep 17 00:00:00 2001
From: Bine Brank
Date: Tue, 23 Nov 2021 21:18:08 +0100
Subject: [PATCH 13/15] reduced dgemm_unroll_m to work with 128-bit sve

---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index bbc52fac4..0ccc4a4d6 100644
--- a/param.h
+++ b/param.h
@@ -3301,7 +3301,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 /* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
 Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated.
 */
-#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
 #define DGEMM_DEFAULT_UNROLL_N 8
 
 #define CGEMM_DEFAULT_UNROLL_M 8

From 1af73ce38e75863c06d434b8f3bd2105df9143b1 Mon Sep 17 00:00:00 2001
From: Bine Brank
Date: Fri, 26 Nov 2021 10:35:01 +0100
Subject: [PATCH 14/15] Adapt CMake for SVE

---
 cmake/cc.cmake        | 18 ++++++++++++++++++
 kernel/CMakeLists.txt | 38 ++++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 0ab1d4c1b..153cdce61 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -139,6 +139,24 @@ if (${CORE} STREQUAL SAPPHIRERAPIDS)
   endif ()
 endif ()
 
+if (${CORE} STREQUAL A64FX)
+  if (NOT DYNAMIC_ARCH)
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
+      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
+    else ()
+      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+    endif()
+  endif ()
+endif ()
+
+if (${CORE} STREQUAL ARMV8SVE)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+  endif ()
+endif ()
+
+
 if (NOT DYNAMIC_ARCH)
   if (HAVE_AVX2)
     set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index 9c8460723..80c7dcd8b 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
   GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
 
   # symm for s and d
+if (NOT DEFINED ${float_char}SYMMUCOPY_M)
+  set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c")
+  set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c")
+else ()
+  set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}")
+  set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}")
+endif()
   GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type})
-  GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type})
 
   GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type})
-  GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type})
 
   # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define.
   # Could simplify it a bit by pairing up by -UUNIT/-DUNIT.
-  GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
-  GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type})
+if (NOT DEFINED ${float_char}TRMMUNCOPY_M)
+  set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
+  set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
+  set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
+  set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
+else ()
+  set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}")
+  set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}")
+  set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}")
+  set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}")
+endif ()
+  GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type})
   GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type})
   GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type})
 
-  GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
-  GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
   GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type})
   GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type})
 
-  GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
-  GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type})
   GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type})
   GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type})
 
-  GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
-  GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
+  GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
   GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
   GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
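
The copy routines selected by the Makefile and CMake changes above are vector-length-agnostic: they query the hardware vector width at run time instead of hard-coding it, which is why the sve_len variables and scalar temporaries dropped in PATCH 12 were never needed, and why PATCH 13 only has to keep DGEMM_DEFAULT_UNROLL_M different from DGEMM_DEFAULT_UNROLL_N until all BLAS3 copy routines go through SVE. The sketch below illustrates that idiom; it is not part of the patches, and the function name and plain row-major panel layout are assumptions made for the example only.

#include <arm_sve.h>
#include <stdint.h>

/* Illustrative stand-in: in OpenBLAS, BLASLONG comes from common.h. */
typedef long BLASLONG;

/* Pack an m x n panel of a (leading dimension lda) into b, one
 * vector-wide block of columns at a time. */
static void copy_panel_sketch(BLASLONG m, BLASLONG n,
                              const double *a, BLASLONG lda, double *b)
{
    uint64_t sve_size = svcntd();   /* doubles per vector: 2 on 128-bit SVE, 8 on A64FX */

    for (BLASLONG js = 0; js < n; js += sve_size) {
        /* The predicate masks off the tail when n is not a multiple of
         * the vector length, so no scalar clean-up loop is needed. */
        svbool_t pn = svwhilelt_b64(js, n);
        uint64_t active = svcntp_b64(svptrue_b64(), pn);

        for (BLASLONG i = 0; i < m; i++) {
            svfloat64_t av = svld1_f64(pn, &a[i * lda + js]);
            svst1_f64(pn, b, av);
            b += active;            /* advance by the number of active lanes */
        }
    }
}

On a 128-bit SVE implementation svcntd() returns 2, which is what the new DGEMM_DEFAULT_UNROLL_M value matches; on a 512-bit A64FX the same object code processes 8 doubles per iteration.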
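Similarly, the svindex_s64(0LL, lda) vector kept in the trmm copy routines from PATCH 12 is an index vector for gather loads: with a step of lda it addresses one element from each of several consecutive columns, so a single predicated gather can read a strided slice of a column-major matrix. A minimal illustration of that pattern, again a sketch rather than the patched kernels (the helper name and layout are assumptions):

#include <arm_sve.h>
#include <stdint.h>

typedef long BLASLONG;   /* stand-in for the OpenBLAS typedef */

/* Gather row i, columns js .. js+VL-1, of a column-major matrix a
 * with leading dimension lda, i.e. elements a[i + (js+k)*lda]. */
static svfloat64_t load_row_slice(const double *a, BLASLONG lda,
                                  BLASLONG i, BLASLONG js, BLASLONG n)
{
    svbool_t pn = svwhilelt_b64(js, n);        /* active columns only */
    svint64_t index = svindex_s64(0LL, lda);   /* 0, lda, 2*lda, ... (in elements) */
    return svld1_gather_s64index_f64(pn, &a[i + js * lda], index);
}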
From ca65a4e91d945f6df8fdbe3cca55af943725653e Mon Sep 17 00:00:00 2001
From: Bine Brank
Date: Fri, 26 Nov 2021 13:11:19 +0100
Subject: [PATCH 15/15] update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6be41960c..39ec96246 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -197,3 +197,7 @@ In chronological order:
 
 * River Dillon
   * [2021-07-10] fix compilation with musl libc
+
+* Bine Brank
+  * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
+  * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM