From 894fde9bfe36fe1988b595d3529a7f808a5a6534 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 21:21:47 +0100 Subject: [PATCH 01/77] Update version to 0.3.19.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1d69da13..913017c63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 19) +set(OpenBLAS_PATCH_VERSION 19.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 8cec83bdfb82effda2075309af5ca36df79f1a8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 21:22:19 +0100 Subject: [PATCH 02/77] Update version to 0.3.19.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 3359860b9..4b4b9bcf9 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.19 +VERSION = 0.3.19.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 8d9b9c6b2a6f015cafcf3e0e568874a1aabcc223 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 21 Dec 2021 09:22:59 +0800 Subject: [PATCH 03/77] loongarch64: Optimize dgemm_kernel --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 15 +- kernel/loongarch64/dgemm_kernel_16x4.S | 4250 ++++++++++++++++++++++++ kernel/loongarch64/dgemm_ncopy_16.S | 691 ++++ kernel/loongarch64/dgemm_ncopy_4.S | 237 ++ kernel/loongarch64/dgemm_tcopy_16.S | 710 ++++ kernel/loongarch64/dgemm_tcopy_4.S | 270 ++ param.h | 10 +- 7 files changed, 6177 insertions(+), 6 deletions(-) create mode 100644 kernel/loongarch64/dgemm_kernel_16x4.S create mode 100644 kernel/loongarch64/dgemm_ncopy_16.S create mode 100644 kernel/loongarch64/dgemm_ncopy_4.S create mode 100644 kernel/loongarch64/dgemm_tcopy_16.S create mode 100644 kernel/loongarch64/dgemm_tcopy_4.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cce4093e3..bb0441ab2 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1 +1,14 @@ -#TODO: Add loongarch64 SIMD optimizations +DGEMMKERNEL = dgemm_kernel_16x4.S +DGEMMINCOPY = dgemm_ncopy_16.S +DGEMMITCOPY = dgemm_tcopy_16.S +DGEMMONCOPY = dgemm_ncopy_4.S +DGEMMOTCOPY = dgemm_tcopy_4.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S new file mode 100644 index 000000000..13faa977e --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -0,0 +1,4250 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define D0 $xr7 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr14 +#define D8 $xr15 +#define D9 $xr16 +#define D10 $xr17 +#define D11 $xr18 +#define D12 $xr19 +#define D13 $xr20 +#define D14 $xr21 +#define D15 $xr22 +#define VALPHA $xr23 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + + PROLOGUE + + addi.d $sp, $sp, -56 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST ALPHA, $sp, 48 + + /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ + xvld VALPHA, $sp, 48 + xvreplve0.d VALPHA, VALPHA + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + xvldrepl.d U4, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U4 + xvfmul.d D11, U3, U4 + + xvldrepl.d U4, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + /* Calculate 8 sets of D0~D15 */ +.L_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, 
D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-7***/ + /* Load 16 * 64 from 
A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, 
D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 + +.L_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + 
xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 + +.L_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M8_L71 + +.L_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, 
VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ + +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + +.L_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, 
B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M4_TL1 + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 
+#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + +.L_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, 
U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + +.L_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + 
/* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 4)) START !!! ************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N3_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + +.L_N3_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 
0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 
0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N3_M0 + + andi I, M, 8 + beq ZERO,I, .L_N3_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +.L_N3_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + 
xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 + +.L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M8_L0 + +.L_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M8_L71 + +.L_N3_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2) && (M & 8) ) End************/ + +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + +.L_N3_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +.L_N3_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 
0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + 
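+    /* The first k step was already issued with xvfmul.d above, so L is
+     * decremented before being split: TL = L >> 3 drives the 8-way
+     * unrolled loop below, and the L & 7 remainder is handled one step
+     * at a time at .L_N3_M1_L71. Each step computes, in effect,
+     * d0 += a[k] * b0[k] and d4 += a[k] * b1[k] on the packed buffers. */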
srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + +.L_N3_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 4)) End !!! 
************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 4)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N1_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +.L_N1_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d 
D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 + +.L_N1_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, 
D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 + +.L_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 + +.L_N1_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 8) ) End************/ + +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +.L_N1_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 
0x20 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +.L_N1_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + 
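+    /* Only D0 is live in this n=1 column: every unrolled step is a single
+     * broadcast (xvldrepl.d from B0) and multiply-accumulate into D0,
+     * after which A0 advances by two doubles (0x10) and B0 by one (0x08). */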
addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +.L_N1_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 + +.L_N1_M1_L7: + /* 
if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 4)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f23, $sp, 40 + addi.d $sp, $sp, 56 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S new file mode 100644 index 000000000..95c879031 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_16.S @@ -0,0 +1,691 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + + PROLOGUE + + addi.d $sp, $sp, -0x90 + SDARG $r23, $sp, 0x00 + SDARG $r24, $sp, 0x08 + SDARG $r25, $sp, 0x10 + SDARG $r26, $sp, 0x18 + SDARG $r27, $sp, 0x20 + SDARG $r28, $sp, 0x28 + SDARG $r29, $sp, 0x30 + SDARG $r30, $sp, 0x38 + SDARG $r31, $sp, 0x40 + ST $f23, $sp, 0x48 + ST $f24, $sp, 0x50 + ST $f25, $sp, 0x58 + ST $f26, $sp, 0x60 + ST $f27, $sp, 0x68 + ST $f28, $sp, 0x70 + ST $f29, $sp, 0x78 + ST $f30, $sp, 0x80 + ST $f31, $sp, 0x88 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x04 + beq J, ZERO, .L_N8 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S9, S7, T0 + add.d S10, S8, T0 + add.d S11, S9, T0 + add.d S12, S10, T0 + add.d S13, S11, T0 + add.d S14, S12, T0 + add.d S15, S13, T0 + add.d S16, S14, T0 + add.d TS, S15, T0 + beq I, ZERO, .L_I7 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + 
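+    /* The xvand.v copies above are plain register moves that keep the
+     * pack-even/pack-odd results alive while the xvpermi.q permutes
+     * recombine their 128-bit halves; the numbered stores below then write
+     * the transposed data so that every four consecutive xvst (tags 0..3,
+     * 4..7, ...) form one 16-element row of the packed panel. */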
xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + xvld U8, S9, 0x20 + xvld U9, S10, 0x20 + xvld U10, S11, 0x20 + xvld U11, S12, 0x20 + xvld U12, S13, 0x20 + xvld U13, S14, 0x20 + xvld U14, S15, 0x20 + xvld U15, S16, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d S9, S9, 0x40 + addi.d S10, S10, 0x40 + addi.d S11, S11, 0x40 + addi.d S12, S12, 0x40 + addi.d S13, S13, 0x40 + addi.d S14, S14, 0x40 + addi.d S15, S15, 0x40 + addi.d S16, S16, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 + +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 
0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + fld.d F0, S9, 0x00 + fld.d F1, S10, 0x00 + fld.d F2, S11, 0x00 + fld.d F3, S12, 0x00 + fld.d F4, S13, 0x00 + fld.d F5, S14, 0x00 + fld.d F6, S15, 0x00 + fld.d F7, S16, 0x00 + + fst.d F0, TD, 0x00 + addi.d S9, S9, 0x08 + fst.d F1, TD, 0x08 + addi.d S10, S10, 0x08 + fst.d F2, TD, 0x10 + addi.d S11, S11, 0x08 + fst.d F3, TD, 0x18 + addi.d S12, S12, 0x08 + fst.d F4, TD, 0x20 + addi.d S13, S13, 0x08 + fst.d F5, TD, 0x28 + addi.d S14, S14, 0x08 + fst.d F6, TD, 0x30 + addi.d S15, S15, 0x08 + fst.d F7, TD, 0x38 + addi.d S16, S16, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N8: + andi J, N, 0x08 + beq ZERO, J, .L_N4 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_8I3 + +.L_8I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_8I1 + +.L_8I3: + andi I, M, 0x07 + beq I, ZERO, .L_N4 + +.L_8I11: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_8I11 + +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 + +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvpermi.q D0, D1, 0x02 // 0 + + xvst D0, TD, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 + +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 + +.L_N0: + LDARG $r23, $sp, 0x00 + LDARG $r24, $sp, 0x08 + LDARG $r25, $sp, 0x10 + LDARG $r26, $sp, 0x18 + LDARG $r27, $sp, 0x20 + LDARG $r28, $sp, 0x28 + LDARG $r29, $sp, 0x30 + LDARG $r30, $sp, 0x38 + LDARG $r31, $sp, 0x40 + LD $f23, $sp, 0x48 + LD $f24, $sp, 0x50 + LD $f25, $sp, 0x58 + LD $f26, $sp, 0x60 + LD $f27, $sp, 0x68 + LD $f28, $sp, 0x70 + LD $f29, $sp, 0x78 + LD $f30, $sp, 0x80 + LD $f31, $sp, 0x88 + addi.d $sp, $sp, 0x90 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4.S b/kernel/loongarch64/dgemm_ncopy_4.S new file mode 100644 index 000000000..b1f322a06 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4.S @@ -0,0 +1,237 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr14 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 + +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvand.v U0, D0, D0 + xvpermi.q D0, D1, 0x02 // 0 + xvpermi.q D1, U0, 0x31 // 1 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_2I3: + andi I, M, 0x03 + beq ZERO, I, .L_N1 + +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 + +.L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 + + move S1, TS + srai.d I, M, 0x02 + beq ZERO, I, .L_1I3 + +.L_1I1: + xvld U0, S1, 0x00 + addi.d S1, S1, 0x20 + xvst U0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x20 + blt ZERO, I, .L_1I1 + +.L_1I3: + andi I, M, 0x03 + beq ZERO, I, .L_N0 + +.L_1II1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x08 + blt ZERO, I, .L_1II1 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_16.S b/kernel/loongarch64/dgemm_tcopy_16.S new file mode 100644 index 000000000..afafe5b37 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_16.S @@ -0,0 +1,710 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, 
.L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + fst.d F4, P5, 0x20 + fst.d F5, P5, 0x28 + fst.d F6, P5, 0x30 + fst.d F7, P5, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 
0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 
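+    /* Tail of the copy with a single source row left: each pass moves
+     * sixteen contiguous doubles (U0-U3) straight into the current
+     * 16-wide panel, then P1 jumps ahead by T1 (M * 0x80 bytes) to the
+     * next panel. */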
+ xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x20 + addi.d P3, P3, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x10 + addi.d P4, P4, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P5, 0x00 + + addi.d S1, S1, 0x08 + addi.d P5, P5, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4.S b/kernel/loongarch64/dgemm_tcopy_4.S new file mode 100644 index 000000000..700989ca1 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4.S @@ -0,0 +1,270 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P2, 0x00 + xvst U2, P2, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P2, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_1N3 + +.L_1I1: + xvld U0, S1, 0x00 + + xvst U0, P1, 0x00 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 
0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/param.h b/param.h index 8dd2a7461..2dffaae3c 100644 --- a/param.h +++ b/param.h @@ -2852,35 +2852,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P sgemm_p -#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_P 32 #define QGEMM_DEFAULT_P qgemm_p #define CGEMM_DEFAULT_P cgemm_p #define ZGEMM_DEFAULT_P zgemm_p #define XGEMM_DEFAULT_P xgemm_p #define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 858 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 128 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 152 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 From e3c9947c0f4338abc437126283576b63a2203623 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 21 Dec 2021 11:19:27 +0100 Subject: [PATCH 04/77] prepare kernel for sve zgemm --- kernel/arm64/KERNEL.A64FX | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 80be4ddd0..04be0fab9 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -169,15 +169,24 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + From 07fe5b19a4957cafe3864e4af0296eb575a2e2f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 12:31:54 +0100 Subject: [PATCH 05/77] typecast function pointers --- driver/others/blas_server.c | 40 
++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index fa07a1ea4..ec79075fe 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*) + (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, - bfloat16 *, BLASLONG, void *) = func; + bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((bfloat16 *)args -> alpha)[0], @@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, - xdouble *, BLASLONG, void *) = func; + xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], @@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + 
double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -425,7 +441,7 @@ blas_queue_t *tscq; #endif if (queue) { - int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); @@ -503,7 +519,7 @@ blas_queue_t *tscq; legacy_exec(routine, queue -> mode, queue -> args, sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); @@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ fprintf(STDERR, "\n"); #endif - routine = queue -> routine; + routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, From d1ee6ff73fca6eecfb679d2a91c39ce91e80231b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:45:28 +0100 Subject: [PATCH 06/77] fix function typecasts --- kernel/x86_64/dasum.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/drot.c | 2 +- kernel/x86_64/sasum.c | 2 +- kernel/x86_64/srot.c | 2 +- kernel/x86_64/zdot.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 8af9e798b..a9c40f38f 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -114,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 5d0c32234..f3b9ee701 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -190,7 +190,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); + (int (*)(void)) dot_thread_function, nthreads); ptr = (RETURN_TYPE *)result; for (i = 0; i < nthreads; i++) { diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index ab5048bd1..40c9cf19d 100644 
--- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -196,7 +196,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index a021741c7..37a92468f 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -123,7 +123,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 587cf8e40..a49544616 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 50c8a2678..c52575d07 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -215,7 +215,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)zdot_thread_function, nthreads); + (int (*)(void))zdot_thread_function, nthreads); ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { From 64365c919e63baaef31f5c52d39ae53d77a98c85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:47:35 +0100 Subject: [PATCH 07/77] fix function typecasts --- interface/axpy.c | 2 +- interface/scal.c | 2 +- interface/zaxpy.c | 4 ++-- interface/zscal.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/interface/axpy.c b/interface/axpy.c index eaa19f4df..5304ebec3 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -115,7 +115,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc #endif blas_level1_thread(mode, n, 0, 0, &alpha, - x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); + x, incx, y, incy, NULL, 0, (int (*)(void))AXPYU_K, nthreads); } #endif diff --git a/interface/scal.c b/interface/scal.c index 6d07b1650..0a7fee640 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #else &alpha, #endif - x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index da3b48ead..0e168606d 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -128,9 +128,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, 
blasint in blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, #ifndef CONJ - (void *)AXPYU_K, + (int (*)(void))AXPYU_K, #else - (void *)AXPYC_K, + (int (*)(void))AXPYC_K, #endif nthreads); } diff --git a/interface/zscal.c b/interface/zscal.c index bfaddc260..498377343 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif From c49d46f25f9c4f626f4a197b01bad749a9d5a7a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:49:18 +0100 Subject: [PATCH 08/77] fix function typecast --- lapack/getrf/getrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index fc410b0e7..fed5c1de5 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -662,7 +662,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, - ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + ipiv, 1, (int (*)(void))LASWP_PLUS, args -> nthreads); is += bk; } From aecb4a5e8daab1b50ae34636001dfdb234948765 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:50:22 +0100 Subject: [PATCH 09/77] fix function typecasts --- lapack/lauum/lauum_L_parallel.c | 4 ++-- lapack/lauum/lauum_U_parallel.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c index 0ebe3f069..1b32e4519 100644 --- a/lapack/lauum/lauum_L_parallel.c +++ b/lapack/lauum/lauum_L_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = i; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i ) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRMM_LCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c index 7214c9731..f5ea54c88 100644 --- a/lapack/lauum/lauum_U_parallel.c +++ b/lapack/lauum/lauum_U_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads); newarg.m = i; newarg.n = bk; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + ( i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, - &newarg, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; 
From 6b407a16cb089492d3ad1e2a1f5fdb71f4ffdd94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:51:28 +0100 Subject: [PATCH 10/77] fix function typecasts --- lapack/potrf/potrf_L_parallel.c | 4 ++-- lapack/potrf/potrf_U_parallel.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 68ec8e22a..986816d1a 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); #endif } } diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index 3b5d39511..cc6ff9912 100644 --- a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); #endif } } From 9809931eb46c483ff3e6ab301a262eb879072450 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:53:55 +0100 Subject: [PATCH 11/77] clean up unused variables and unreachable statements --- cpuid_x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 72e95214e..6466bd148 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -323,9 +323,11 @@ int get_vendor(void){ int get_cputype(int gettype){ int eax, ebx, ecx, edx; +/* int extend_family, family; int extend_model, model; int type, stepping; +*/ int feature = 0; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -428,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ cpuid(0, &cpuid_level, &ebx, &ecx, &edx); if (cpuid_level > 1) { - int numcalls =0 ; + int numcalls; + cpuid(2, &eax, &ebx, &ecx, &edx); numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries info[ 0] = BITMASK(eax, 8, 0xff); @@ -1637,7 +1640,6 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } - break; case 10: // Zen3 if(support_avx()) #ifndef NO_AVX2 @@ -2193,7 +2195,6 @@ int get_coretype(void){ else return CORE_NEHALEM; #endif - break; case 7: if (model == 10) @@ -2582,4 +2583,4 @@ void get_sse(void){ if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); } -//} \ No newline at end of file +//} From 
2db0b2e4453b0a502cf336f6288688c23246d202 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 23 Dec 2021 20:04:27 +0800 Subject: [PATCH 12/77] Fixed MSA enabled optimization on Loongson-3A4000 --- cpuid_mips.c | 6 +++--- cpuid_mips64.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index 1946455d8..d787e7120 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -165,7 +165,7 @@ void get_cpuconfig(void){ }else{ printf("#define UNKNOWN\n"); } - if (!get_feature(msa)) printf("#define NO_MSA\n"); + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -193,7 +193,7 @@ int get_feature(char *search) while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Features", buffer, 8)) + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) { p = strchr(buffer, ':') + 2; break; @@ -207,7 +207,7 @@ int get_feature(char *search) t = strtok(p," "); while( t = strtok(NULL," ")) { - if (!strcmp(t, search)) { return(1); } + if (strstr(t, search)) { return(1); } } #endif diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 97743bc43..8753ee3f0 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -201,7 +201,7 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } - if (!get_feature(msa)) printf("#define NO_MSA\n"); + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -233,7 +233,7 @@ int get_feature(char *search) while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Features", buffer, 8)) + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) { p = strchr(buffer, ':') + 2; break; @@ -247,7 +247,7 @@ int get_feature(char *search) t = strtok(p," "); while( t = strtok(NULL," ")) { - if (!strcmp(t, search)) { return(1); } + if (strstr(t, search)) { return(1); } } #endif From e9a0e52201282ee1caec67475307aa7717b2bc31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Dec 2021 20:00:50 +0100 Subject: [PATCH 13/77] fix function typecast --- kernel/x86_64/casum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index a1bd76f33..60feec0ce 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); From 7b146e590c1d93c62cb0a7590a3ca287bcde52c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Dec 2021 20:01:52 +0100 Subject: [PATCH 14/77] fix function typecast --- kernel/x86_64/zasum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 6e758e2e3..80e95a2c8 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); From 683a7548bf34f610f5bdedfac5c1dac425c66a59 Mon Sep 17 00:00:00 
2001 From: Bine Brank Date: Sat, 25 Dec 2021 11:46:41 +0100 Subject: [PATCH 15/77] added macros for sve zgemm kernels --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 1159 ++++++++++++++++++++++++++ 1 file changed, 1159 insertions(+) create mode 100644 kernel/arm64/zgemm_kernel_sve_v1x4.S diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..0fc966f8c --- /dev/null +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -0,0 +1,1159 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define alphaR x19 +#define alphaI x20 + +#define alphaz_R z10.d +#define alphaz_I z11.d +#define alpha0_R d10 +#define alphaV0_R v10.d[0] +#define alpha0_I d11 +#define alphaV0_I v11.d[0] + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + ld2d {z2.d, z3.d}, p1/z, [pA, lanes, lsl #4] // next one + add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, 
p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z25.d, z26.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2d {z26.d, z27.d}, p1/z, [pCRow0] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2d {z28.d, z29.d}, p1/z, [pCRow1] + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld2d {z30.d, z31.d}, p1/z, [pCRow1] + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] 
+ + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z25.d, z26.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2d {z26.d, z27.d}, p1/z, [pCRow0] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z25.d, z26.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lzgemm_kernel_L2_BEGIN + +.Lzgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lzgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble .Lzgemm_kernel_L4_M2_BEGIN + + .align 5 +.Lzgemm_kernel_L4_M4_20: + + mov pB, origPB + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lzgemm_kernel_L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lzgemm_kernel_L4_M4_22a + + .align 5 +.Lzgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M4_22 + + .align 5 +.Lzgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + 
KERNEL4x4_E + + b .Lzgemm_kernel_L4_M4_44 + + .align 5 +.Lzgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lzgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lzgemm_kernel_L4_M4_44 + + +.Lzgemm_kernel_L4_M4_40: + + INIT4x4 + +.Lzgemm_kernel_L4_M4_44: + + ands counterL , origK, #7 + ble .Lzgemm_kernel_L4_M4_100 + + .align 5 +.Lzgemm_kernel_L4_M4_46: + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bne .Lzgemm_kernel_L4_M4_46 + +.Lzgemm_kernel_L4_M4_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVE4x4 + +.Lzgemm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne .Lzgemm_kernel_L4_M4_20 + +.Lzgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L4_M1_BEGIN + +.Lzgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L4_M2_40 + +.Lzgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M2_22 + + +.Lzgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L4_M2_100 + +.Lzgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M2_42 + +.Lzgemm_kernel_L4_M2_100: + + SAVE2x4 + +.Lzgemm_kernel_L4_M2_END: + + +.Lzgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L4_END + +.Lzgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L4_M1_40 + +.Lzgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M1_22 + + +.Lzgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L4_M1_100 + +.Lzgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M1_42 + +.Lzgemm_kernel_L4_M1_100: + + SAVE1x4 + + +.Lzgemm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lzgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lzgemm_kernel_L999 + + tst counterJ , #2 + ble .Lzgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble .Lzgemm_kernel_L2_M2_BEGIN + +.Lzgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_M4_40 + .align 5 + +.Lzgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M4_22 + + +.Lzgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble 
.Lzgemm_kernel_L2_M4_100 + +.Lzgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M4_42 + +.Lzgemm_kernel_L2_M4_100: + + SAVE4x2 + +.Lzgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt .Lzgemm_kernel_L2_M4_20 + + +.Lzgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L2_M1_BEGIN + +.Lzgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_M2_40 + +.Lzgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M2_22 + + +.Lzgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_M2_100 + +.Lzgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M2_42 + +.Lzgemm_kernel_L2_M2_100: + + SAVE2x2 + +.Lzgemm_kernel_L2_M2_END: + + +.Lzgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L2_END + +.Lzgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lzgemm_kernel_L2_M1_40 + +.Lzgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M1_22 + + +.Lzgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_M1_100 + +.Lzgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M1_42 + +.Lzgemm_kernel_L2_M1_100: + + SAVE1x2 + + +.Lzgemm_kernel_L2_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lzgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lzgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble .Lzgemm_kernel_L1_M2_BEGIN + +.Lzgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_M4_40 + .align 5 + +.Lzgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M4_22 + + +.Lzgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M4_100 + +.Lzgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M4_42 + +.Lzgemm_kernel_L1_M4_100: + + SAVE4x1 + +.Lzgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt .Lzgemm_kernel_L1_M4_20 + + +.Lzgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L1_M1_BEGIN + +.Lzgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble 
.Lzgemm_kernel_L1_M2_40 + +.Lzgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M2_22 + + +.Lzgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M2_100 + +.Lzgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M2_42 + +.Lzgemm_kernel_L1_M2_100: + + SAVE2x1 + +.Lzgemm_kernel_L1_M2_END: + + +.Lzgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L1_END + +.Lzgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_M1_40 + +.Lzgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M1_22 + + +.Lzgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M1_100 + +.Lzgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M1_42 + +.Lzgemm_kernel_L1_M1_100: + + SAVE1x1 + + +.Lzgemm_kernel_L1_END: + + +.Lzgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 878064f39463631e0daf78395248083f1c8b251f Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 26 Dec 2021 08:44:05 +0100 Subject: [PATCH 16/77] sve zgemm kernel --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 544 +++++++-------------------- 1 file changed, 132 insertions(+), 412 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index 0fc966f8c..1201d6dac 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -48,6 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow2 x14 #define pCRow3 x15 #define pA x16 +#define lanes x17 + #define alphaR x19 #define alphaI x20 @@ -168,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x4_I ld2d {z0.d, z1.d}, p1/z, [pA] - ld2d {z2.d, z3.d}, p1/z, [pA, lanes, lsl #4] // next one + ld2d {z2.d, z3.d}, p1/z, [pA, #2, mul vl] // next one add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 ld1rd z8.d, p0/z, [pB] @@ -561,17 +563,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [origPA] fmov alphaR, d0 + dup alphaz_R, alphaR fmov alphaI, d1 + dup alphaz_I, alphaI lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate mov pB, origPB +// Loop over N mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble .Lzgemm_kernel_L2_BEGIN +/******************************************************************************/ .Lzgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -582,204 +589,112 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov pA, origPA // pA = start of A array -.Lzgemm_kernel_L4_M4_BEGIN: +.Lzgemm_kernel_L4_Mv1_BEGIN: - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI, #0 - ble .Lzgemm_kernel_L4_M2_BEGIN +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 -.Lzgemm_kernel_L4_M4_20: +.Lzgemm_kernel_L4_Mv1_20: mov pB, origPB + INITv1x4 // fill with zeros + asr counterL , origK, #3 cmp counterL , #2 - blt .Lzgemm_kernel_L4_M4_32 + blt .Lzgemm_kernel_L4_Mv1_32 - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 subs counterL, counterL, #2 // subtract 2 - ble .Lzgemm_kernel_L4_M4_22a + ble .Lzgemm_kernel_L4_Mv1_22a .align 5 -.Lzgemm_kernel_L4_M4_22: +.Lzgemm_kernel_L4_Mv1_22: - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M4_22 + bgt .Lzgemm_kernel_L4_Mv1_22 .align 5 -.Lzgemm_kernel_L4_M4_22a: +.Lzgemm_kernel_L4_Mv1_22a: - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E - b .Lzgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_Mv1_44 .align 5 -.Lzgemm_kernel_L4_M4_32: +.Lzgemm_kernel_L4_Mv1_32: tst counterL, #1 - ble .Lzgemm_kernel_L4_M4_40 + ble .Lzgemm_kernel_L4_Mv1_40 - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E - b .Lzgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_Mv1_44 -.Lzgemm_kernel_L4_M4_40: +.Lzgemm_kernel_L4_Mv1_40: - INIT4x4 + INITv1x4 -.Lzgemm_kernel_L4_M4_44: +.Lzgemm_kernel_L4_Mv1_44: ands counterL , origK, #7 - ble .Lzgemm_kernel_L4_M4_100 + ble .Lzgemm_kernel_L4_Mv1_100 .align 5 -.Lzgemm_kernel_L4_M4_46: - KERNEL4x4_SUB +.Lzgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB subs counterL, counterL, #1 - bne .Lzgemm_kernel_L4_M4_46 + bne .Lzgemm_kernel_L4_Mv1_46 -.Lzgemm_kernel_L4_M4_100: +.Lzgemm_kernel_L4_Mv1_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] - SAVE4x4 + SAVEv1x4 -.Lzgemm_kernel_L4_M4_END: - subs counterI, counterI, #1 - bne .Lzgemm_kernel_L4_M4_20 +.Lzgemm_kernel_L4_Mv1_END: -.Lzgemm_kernel_L4_M2_BEGIN: + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lzgemm_kernel_L4_Mv1_20 - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L4_END - - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L4_M1_BEGIN - -.Lzgemm_kernel_L4_M2_20: - - INIT2x4 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L4_M2_40 - -.Lzgemm_kernel_L4_M2_22: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - 
KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M2_22 - - -.Lzgemm_kernel_L4_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L4_M2_100 - -.Lzgemm_kernel_L4_M2_42: - - KERNEL2x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M2_42 - -.Lzgemm_kernel_L4_M2_100: - - SAVE2x4 - -.Lzgemm_kernel_L4_M2_END: - - -.Lzgemm_kernel_L4_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L4_END - -.Lzgemm_kernel_L4_M1_20: - - INIT1x4 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L4_M1_40 - -.Lzgemm_kernel_L4_M1_22: - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M1_22 - - -.Lzgemm_kernel_L4_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L4_M1_100 - -.Lzgemm_kernel_L4_M1_42: - - KERNEL1x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M1_42 - -.Lzgemm_kernel_L4_M1_100: - - SAVE1x4 .Lzgemm_kernel_L4_END: @@ -810,157 +725,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -.Lzgemm_kernel_L2_M4_BEGIN: +.Lzgemm_kernel_L2_Mv1_BEGIN: - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI,#0 - ble .Lzgemm_kernel_L2_M2_BEGIN + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d -.Lzgemm_kernel_L2_M4_20: - INIT4x2 +.Lzgemm_kernel_L2_Mv1_20: + + INITv1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble .Lzgemm_kernel_L2_M4_40 + ble .Lzgemm_kernel_L2_Mv1_40 .align 5 -.Lzgemm_kernel_L2_M4_22: - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB +.Lzgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M4_22 + bgt .Lzgemm_kernel_L2_Mv1_22 -.Lzgemm_kernel_L2_M4_40: +.Lzgemm_kernel_L2_Mv1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M4_100 + ble .Lzgemm_kernel_L2_Mv1_100 -.Lzgemm_kernel_L2_M4_42: +.Lzgemm_kernel_L2_Mv1_42: - KERNEL4x2_SUB + KERNELv1x2_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M4_42 + bgt .Lzgemm_kernel_L2_Mv1_42 -.Lzgemm_kernel_L2_M4_100: +.Lzgemm_kernel_L2_Mv1_100: - SAVE4x2 + SAVEv1x2 -.Lzgemm_kernel_L2_M4_END: - - subs counterI, counterI, #1 - bgt .Lzgemm_kernel_L2_M4_20 +.Lzgemm_kernel_L2_Mv1_END: -.Lzgemm_kernel_L2_M2_BEGIN: - - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L2_END - - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L2_M1_BEGIN - -.Lzgemm_kernel_L2_M2_20: - - INIT2x2 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL,#0 - ble .Lzgemm_kernel_L2_M2_40 - -.Lzgemm_kernel_L2_M2_22: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M2_22 - - -.Lzgemm_kernel_L2_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M2_100 - -.Lzgemm_kernel_L2_M2_42: - - KERNEL2x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M2_42 - 
-.Lzgemm_kernel_L2_M2_100: - - SAVE2x2 - -.Lzgemm_kernel_L2_M2_END: - - -.Lzgemm_kernel_L2_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L2_END - -.Lzgemm_kernel_L2_M1_20: - - INIT1x2 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL, #0 - ble .Lzgemm_kernel_L2_M1_40 - -.Lzgemm_kernel_L2_M1_22: - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M1_22 - - -.Lzgemm_kernel_L2_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M1_100 - -.Lzgemm_kernel_L2_M1_42: - - KERNEL1x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M1_42 - -.Lzgemm_kernel_L2_M1_100: - - SAVE1x2 + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L2_Mv1_20 .Lzgemm_kernel_L2_END: @@ -981,163 +800,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pA, origPA // pA = A +.Lzgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d -.Lzgemm_kernel_L1_M4_BEGIN: +.Lzgemm_kernel_L1_Mv1_20: - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI, #0 - ble .Lzgemm_kernel_L1_M2_BEGIN - -.Lzgemm_kernel_L1_M4_20: - - INIT4x1 + INITv1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble .Lzgemm_kernel_L1_M4_40 + ble .Lzgemm_kernel_L1_Mv1_40 .align 5 -.Lzgemm_kernel_L1_M4_22: - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB +.Lzgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M4_22 + bgt .Lzgemm_kernel_L1_Mv1_22 -.Lzgemm_kernel_L1_M4_40: +.Lzgemm_kernel_L1_Mv1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M4_100 + ble .Lzgemm_kernel_L1_Mv1_100 -.Lzgemm_kernel_L1_M4_42: +.Lzgemm_kernel_L1_Mv1_42: - KERNEL4x1_SUB + KERNELv1x1_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M4_42 + bgt .Lzgemm_kernel_L1_Mv1_42 -.Lzgemm_kernel_L1_M4_100: +.Lzgemm_kernel_L1_Mv1_100: - SAVE4x1 + SAVEv1x1 -.Lzgemm_kernel_L1_M4_END: - - subs counterI, counterI, #1 - bgt .Lzgemm_kernel_L1_M4_20 - - -.Lzgemm_kernel_L1_M2_BEGIN: - - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L1_END - - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L1_M1_BEGIN - -.Lzgemm_kernel_L1_M2_20: - - INIT2x1 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L1_M2_40 - -.Lzgemm_kernel_L1_M2_22: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M2_22 - - -.Lzgemm_kernel_L1_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M2_100 - -.Lzgemm_kernel_L1_M2_42: - - KERNEL2x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M2_42 - -.Lzgemm_kernel_L1_M2_100: - - SAVE2x1 - -.Lzgemm_kernel_L1_M2_END: - - -.Lzgemm_kernel_L1_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L1_END - -.Lzgemm_kernel_L1_M1_20: - - INIT1x1 - - mov pB, 
origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L1_M1_40 - -.Lzgemm_kernel_L1_M1_22: - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M1_22 - - -.Lzgemm_kernel_L1_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M1_100 - -.Lzgemm_kernel_L1_M1_42: - - KERNEL1x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M1_42 - -.Lzgemm_kernel_L1_M1_100: - - SAVE1x1 +.Lzgemm_kernel_L1_Mv1_END: + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L1_Mv1_20 .Lzgemm_kernel_L1_END: +/******************************************************************************/ .Lzgemm_kernel_L999: mov x0, #0 // set return value From 6ec4aab8754b4c0fa5a6dd359fe56ee755e04ee3 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 26 Dec 2021 17:05:46 +0100 Subject: [PATCH 17/77] zgemm sve copy routines --- kernel/arm64/zgemm_ncopy_sve_v1.c | 80 +++++++++++++++++++++++++++++++ kernel/arm64/zgemm_tcopy_sve_v1.c | 77 +++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 kernel/arm64/zgemm_ncopy_sve_v1.c create mode 100644 kernel/arm64/zgemm_tcopy_sve_v1.c diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..be18e9708 --- /dev/null +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda * 2); + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); + svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active; + } + aoffset += sve_size * lda * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..085e1fa40 --- /dev/null +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1); + svst2_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += sve_size * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} From 6cae44d4f7afc6352a6521e717eff80f0220aded Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Dec 2021 19:06:55 +0100 Subject: [PATCH 18/77] Ensure that the right xerbla gets included in OSX DYNAMIC_ARCH builds --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 913017c63..decd8cc2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,12 +251,14 @@ if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_Fortran_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From 40b14e4957b9a5d9bbda30fc10aeeba485755f3c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 29 Dec 2021 11:42:04 +0100 Subject: [PATCH 19/77] fix zgemm kernel --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 59 +++++++++++++--------------- kernel/arm64/zgemm_ncopy_sve_v1.c | 2 +- kernel/arm64/zgemm_tcopy_sve_v1.c | 2 +- 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index 1201d6dac..d5b35775c 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -53,12 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define alphaR x19 #define alphaI x20 -#define alphaz_R z10.d -#define alphaz_I z11.d -#define alpha0_R d10 -#define alphaV0_R v10.d[0] -#define alpha0_I d11 -#define alphaV0_I v11.d[0] +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 #define A_PRE_SIZE 2560 @@ -170,8 +168,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x4_I ld2d {z0.d, z1.d}, p1/z, [pA] - ld2d {z2.d, z3.d}, p1/z, [pA, #2, mul vl] // next one - add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -283,7 +282,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNELv1x4_M2 - ld2d {z2.d, z3.d}, p1/z, [pA] + ld2d {z0.d, z1.d}, p1/z, [pA] add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 OP_rr z16.d, p1/m, z2.d, z8.d @@ -396,39 +395,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] + st2d {z24.d, z25.d}, p1, [pCRow0] - add pCRow0, pCRow0, #32 + add pCRow0, pCRow0, lanes, lsl #4 - ld2d {z26.d, z27.d}, p1/z, [pCRow0] + ld2d {z26.d, z27.d}, p1/z, [pCRow1] fmla z26.d, p1/m, z18.d, alphaz_R fmls z26.d, p1/m, z19.d, alphaz_I fmla z27.d, p1/m, z18.d, alphaz_I fmla z27.d, p1/m, z19.d, alphaz_R - st2d {z26.d, z27.d}, p1, [pCRow0] + st2d {z26.d, z27.d}, p1, [pCRow1] - add pCRow0, pCRow0, #32 + add pCRow1, pCRow1, lanes, lsl #4 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld2d {z28.d, z29.d}, p1/z, [pCRow1] + ld2d {z28.d, z29.d}, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaz_R fmls z28.d, p1/m, z21.d, alphaz_I fmla z29.d, p1/m, z20.d, alphaz_I fmla z29.d, p1/m, z21.d, alphaz_R - st2d {z28.d, z29.d}, p1, [pCRow1] + st2d {z28.d, z29.d}, p1, [pCRow2] - add pCRow1, pCRow1, #32 + add pCRow2, pCRow2, lanes, lsl #4 - ld2d {z30.d, z31.d}, p1/z, [pCRow1] + ld2d {z30.d, z31.d}, p1/z, [pCRow3] fmla z30.d, p1/m, z22.d, alphaz_R fmls z30.d, p1/m, z23.d, alphaz_I fmla z31.d, p1/m, z22.d, alphaz_I fmla z31.d, p1/m, z23.d, alphaz_R - st2d {z30.d, z31.d}, p1, [pCRow1] + st2d {z30.d, z31.d}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] @@ -474,24 +473,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] + st2d {z24.d, z25.d}, p1, [pCRow0] - add pCRow0, pCRow0, #32 + add pCRow0, pCRow0, lanes, lsl #4 - ld2d {z26.d, z27.d}, p1/z, [pCRow0] + ld2d {z26.d, z27.d}, p1/z, [pCRow1] fmla z26.d, p1/m, z18.d, alphaz_R fmls z26.d, p1/m, z19.d, alphaz_I fmla z27.d, p1/m, z18.d, alphaz_I fmla z27.d, p1/m, z19.d, alphaz_R - st2d {z26.d, z27.d}, p1, [pCRow0] + st2d {z26.d, z27.d}, p1, [pCRow1] - add pCRow0, pCRow0, #32 + add pCRow1, pCRow1, lanes, lsl #4 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 - .endm /******************************************************************************/ @@ -526,10 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] - - add pCRow0, pCRow0, #32 - + st2d {z24.d, z25.d}, p1, [pCRow0] add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 @@ -718,6 +712,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble .Lzgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC add pC,pC,LDC, lsl #1 diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index be18e9708..57035f4ff 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); aoffset1 += 2; - boffset += active; + boffset += active * 2; } aoffset += sve_size * lda * 2; diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index 085e1fa40..32f217d7a 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset1 += lda * 2; boffset += active * 2; } - aoffset += sve_size * 2; + aoffset += active * 2; j += svcntd(); pg = svwhilelt_b64(j, n); From ea3db69faa99dc4f7ad6641c9590f77ace7d6b03 Mon Sep 17 00:00:00 2001 From: jgillis Date: Wed, 29 Dec 2021 22:50:20 +0100 Subject: [PATCH 20/77] Fix cmake crosscompilation for core2 target Missing HAVE_SSE* cmake variables cause cc.cmake to forget about `-msse*` flags --- cmake/prebuild.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 259d9c738..232a6cc35 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DLOCAL_BUFFER_SIZE\t16384\n" "#define CLOCAL_BUFFER_SIZE\t16384\n" "#define ZLOCAL_BUFFER_SIZE\t16384\n") + set(HAVE_SSE 1) + set(HAVE_SSE2 1) + set(HAVE_SSE3 1) + set(HAVE_SSSE3 1) set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) From f7b69128680323ae30ff5992c2ea9f7cc8db8973 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 30 Dec 2021 21:00:16 +0100 Subject: [PATCH 21/77] ztrmm sve copy kernels --- kernel/arm64/ztrmm_lncopy_sve_v1.c | 145 +++++++++++++++++++++++++++++ kernel/arm64/ztrmm_ltcopy_sve_v1.c | 143 ++++++++++++++++++++++++++++ kernel/arm64/ztrmm_uncopy_sve_v1.c | 145 +++++++++++++++++++++++++++++ 
kernel/arm64/ztrmm_utcopy_sve_v1.c | 141 ++++++++++++++++++++++++++++ 4 files changed, 574 insertions(+) create mode 100644 kernel/arm64/ztrmm_lncopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_ltcopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_uncopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_utcopy_sve_v1.c diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c new file mode 100644 index 000000000..19c34ff41 --- /dev/null +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda*2); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda*2); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..c272db602 --- /dev/null +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + } +#endif + ao += n_active * lda * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + + return 0; +} diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c new file mode 100644 index 000000000..aaa217063 --- /dev/null +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda * 2); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda * 2); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c new file mode 100644 index 000000000..c3e1f1b42 --- /dev/null +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} From b329e45288c2e7fc0ef15c4e8a7b3c8dfd74a930 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Jan 2022 00:46:23 +0100 Subject: [PATCH 22/77] Guard against omp_get_num_places returning zero --- driver/others/memory.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index bd0553ca9..0f4cbb24d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -232,11 +232,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; - + int ret; #if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; + #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -249,7 +249,8 @@ int get_num_procs(void) { #if defined(USE_OPENMP) #if _OPENMP >= 201511 - nums = omp_get_num_places(); + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; #endif return nums; #endif @@ -1800,11 +1801,12 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; - + int ret; + #if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; + #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -1818,7 +1820,8 @@ int get_num_procs(void) { #if defined(USE_OPENMP) /* if (omp_get_proc_bind() != omp_proc_bind_false) */ #if _OPENMP >= 201511 - nums = omp_get_num_places(); + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; #endif return nums; #endif From 0140373802db2d910baa92bc7b31dba076fc205b Mon Sep 17 00:00:00 2001 From: Bine 
Brank Date: Sun, 2 Jan 2022 19:15:33 +0100 Subject: [PATCH 23/77] add sve ztrmm --- kernel/Makefile.L3 | 32 + kernel/arm64/KERNEL.A64FX | 12 +- kernel/arm64/ztrmm_kernel_sve_v1x4.S | 1006 ++++++++++++++++++++++++++ 3 files changed, 1044 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/ztrmm_kernel_sve_v1x4.S diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index d22bd46a5..da279b185 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1739,29 +1739,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRMMUNCOPY_M +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLNCOPY_M +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMUTCOPY_M +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLTCOPY_M +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ 
$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 04be0fab9..986b7ab47 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -182,11 +182,11 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c +ZSYMMUCOPY_M = symm_ucopy_sve.c +ZSYMMLCOPY_M = symm_lcopy_sve.c diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..1a81b4da0 --- /dev/null +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, 
p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, 
#B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + 
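+    // The OP_rr/OP_ir/OP_ii/OP_ri quartet is one k-step of a complex
+    // multiply-accumulate per column of C. With the non-conjugated mapping
+    // (NN/NT/TN/TT: OP_rr = fmla, OP_ii = fmls, OP_ri = OP_ir = fmla) each
+    // SVE lane computes, roughly,
+    //   Re(acc) += a_re*b_re - a_im*b_im
+    //   Im(acc) += a_im*b_re + a_re*b_im
+    // where z0/z1 are the de-interleaved real/imag lanes of A (ld2d) and
+    // z8..z11 the broadcast real/imag parts of the two B entries (ld1rd).
+    // The conjugated variants differ only in which products are negated.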
OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lztrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lztrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +.Lztrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lztrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lztrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lztrmm_kernel_L4_Mv1_22a + + .align 5 +.Lztrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L4_Mv1_22 + + .align 5 +.Lztrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + .align 5 +.Lztrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lztrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + +.Lztrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lztrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lztrmm_kernel_L4_Mv1_100 + + .align 5 +.Lztrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lztrmm_kernel_L4_Mv1_46 + +.Lztrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lztrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lztrmm_kernel_L4_Mv1_20 + + + +.Lztrmm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lztrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lztrmm_kernel_L999 + + tst counterJ , #2 + ble .Lztrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lztrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L2_Mv1_20: + + 
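+    // N-remainder path (2 columns), one SVE vector of M rows per iteration:
+    // z16..z19 hold the real/imag accumulators for the two columns. For TRMM
+    // the offset logic below advances pA/pB past the part of K this tile must
+    // not touch and trims the inner loop count to tempK before the v1x2
+    // kernel runs.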
INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lztrmm_kernel_L2_Mv1_40 + .align 5 + +.Lztrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_22 + + +.Lztrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L2_Mv1_100 + +.Lztrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_42 + +.Lztrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lztrmm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L2_Mv1_20 + + +.Lztrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lztrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lztrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lztrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , temp, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lztrmm_kernel_L1_Mv1_40 + .align 5 + +.Lztrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_22 + + +.Lztrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L1_Mv1_100 + +.Lztrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_42 + +.Lztrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lztrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L1_Mv1_20 + +.Lztrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lztrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From ce329ab6869bd958cde05c1dcd39ce7c6bc02cd9 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 3 Jan 2022 15:56:05 +0100 Subject: [PATCH 24/77] add sve zhemm copy routines --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/zhemm_ltcopy_sve.c | 106 +++++++++++++++++++++++++++++++ kernel/arm64/zhemm_utcopy_sve.c | 107 ++++++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/zhemm_ltcopy_sve.c create mode 100644 kernel/arm64/zhemm_utcopy_sve.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 986b7ab47..ff5d3aa0e 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -187,6 +187,6 @@ ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -ZSYMMUCOPY_M = symm_ucopy_sve.c -ZSYMMLCOPY_M = symm_lcopy_sve.c +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c new file mode 100644 index 000000000..58e9ff589 --- /dev/null +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -0,0 +1,106 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c new file mode 100644 index 000000000..9ddbf6cbd --- /dev/null +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} From 68c414d3a6d9af7f8a686868feeddcd237977b05 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 4 Jan 2022 14:40:59 +0100 Subject: [PATCH 25/77] ztrmm sve copy functions --- kernel/arm64/ztrmm_lncopy_sve_v1.c | 14 
+++++++------- kernel/arm64/ztrmm_ltcopy_sve_v1.c | 12 ++++++------ kernel/arm64/ztrmm_uncopy_sve_v1.c | 14 +++++++------- kernel/arm64/ztrmm_utcopy_sve_v1.c | 12 ++++++------ 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index 19c34ff41..d34f607ab 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = 0; FLOAT *ao; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda*2); + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda*2); + svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X < posY) { - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -99,8 +99,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k < j; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } b[temp++] = ONE; b[temp++] = ZERO; @@ -113,8 +113,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k <= j; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } for (int k = j+1; k < n_active; k++) { b[temp++] = ZERO; diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index c272db602..7f34c9857 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svfloat32x2_t aj_vec = svld2(pn, ao); #endif svst2(pn, b, aj_vec); - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -101,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ONE; b[temp++] = ZERO; for (int k = j+1; k < n_active; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } } #else @@ -113,12 +113,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ZERO; } for (int k = j; k < n_active; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } } #endif - ao += n_active * lda * 2; + ao += n_active * lda; b += n_active*n_active * 2; X += n_active; i += n_active; diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index aaa217063..7eb9452c9 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = 0; FLOAT *ao; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda * 2); + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda * 2); + svint32_t index = svindex_s32(0, lda); 
svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X > posY) { - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -105,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ONE; b[temp++] = ZERO; for (int k = j+1; k < n_active; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } } #else @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ZERO; } for (int k = j; k < n_active; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } } #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index c3e1f1b42..60c8ff3b4 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svfloat32x2_t aj_vec = svld2(pn, ao); #endif svst2(pn, b, aj_vec); - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -95,8 +95,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k < j; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } b[temp++] = ONE; b[temp++] = ZERO; @@ -109,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k <= j; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } for (int k = j+1; k < n_active; k++) { b[temp++] = ZERO; @@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } #endif - ao += n_active * lda * 2; + ao += n_active * lda; b += n_active*n_active * 2; X += n_active; i += n_active; From 2e2c02b762afd67fe3cfb49620ab9df721f1a8ea Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 4 Jan 2022 14:42:07 +0100 Subject: [PATCH 26/77] fix sve ztrmm kernel --- kernel/arm64/ztrmm_kernel_sve_v1x4.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S index 1a81b4da0..b71a3d39e 100644 --- a/kernel/arm64/ztrmm_kernel_sve_v1x4.S +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -723,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif prfm PLDL1KEEP, [pA] @@ -856,7 +856,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif .Lztrmm_kernel_L2_Mv1_END: @@ -923,7 +923,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add tempK, tempOffset, #1 #endif - asr counterL , temp, #3 // counterL = counterL / 8 + asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lztrmm_kernel_L1_Mv1_40 .align 5 @@ -972,7 +972,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif .Lztrmm_kernel_L1_Mv1_END: From 07fa6fa3b192f525f5bb8f36e7fc694095f53593 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 08:57:51 +0100 Subject: [PATCH 27/77] configure Makefile for sve --- kernel/Makefile.L3 | 86 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 7 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index da279b185..1c0931d96 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1691,29 +1691,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef CTRMMUNCOPY_M +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMLNCOPY_M +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMUTCOPY_M +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef CTRMMLTCOPY_M +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1929,11 +1961,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N) $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef CSYMMUCOPY_M +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef CSYMMLCOPY_M +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1941,11 +1983,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N) $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef ZSYMMUCOPY_M +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef ZSYMMLCOPY_M +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c 
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1965,11 +2017,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N $(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef CHEMMUTCOPY_M +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef CHEMMLTCOPY_M +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -1977,11 +2039,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef ZHEMMUTCOPY_M +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef ZHEMMLTCOPY_M +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ From d30157d8914c812f97d1b4de7631ead7440b3d3e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:00:54 +0100 Subject: [PATCH 28/77] update configuration of kernels for A64FX and ARMV8SVE --- kernel/arm64/KERNEL.A64FX | 29 +++++++++++++------ kernel/arm64/KERNEL.ARMV8SVE | 54 +++++++++++++++++++++++++----------- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index ff5d3aa0e..76dda0c65 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -156,19 +156,30 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY 
= cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = chemm_ltcopy_sve.c +CHEMMUTCOPY_M = chemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S @@ -190,3 +201,5 @@ ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c ZHEMMUTCOPY_M = zhemm_utcopy_sve.c +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 0364a929c..63dfde22f 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -156,28 +156,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = chemm_ltcopy_sve.c +CHEMMUTCOPY_M = chemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = 
ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c From 87537b8c553a3d79ae2123b36716cc22a20280b1 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:07:28 +0100 Subject: [PATCH 29/77] modify sve zgemmcopy kernels --- kernel/arm64/zgemm_ncopy_sve_v1.c | 3 +-- kernel/arm64/zgemm_tcopy_sve_v1.c | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index 57035f4ff..8f9b4268a 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ IFLOAT *aoffset, *aoffset1, *boffset; svint64_t lda_vec = svindex_s64(0LL, lda * 2); - uint64_t sve_size = svcntd(); aoffset = a; boffset = b; @@ -67,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset1 += 2; boffset += active * 2; } - aoffset += sve_size * lda * 2; + aoffset += active * lda * 2; j += svcntd(); pg = svwhilelt_b64(j, n); diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index 32f217d7a..c6e50bc1c 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -46,8 +46,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG j; IFLOAT *aoffset, *aoffset1, *boffset; - uint64_t sve_size = svcntd(); - aoffset = a; boffset = b; From 18102ae8c317c0e2ba371ecff2d35b72132976e3 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:09:18 +0100 Subject: [PATCH 30/77] add cgemm ctrmm sve kernels --- kernel/arm64/cgemm_kernel_sve_v1x4.S | 874 ++++++++++++++++++++++ kernel/arm64/ctrmm_kernel_sve_v1x4.S | 1006 ++++++++++++++++++++++++++ 2 files changed, 1880 insertions(+) create mode 100644 kernel/arm64/cgemm_kernel_sve_v1x4.S create mode 100644 kernel/arm64/ctrmm_kernel_sve_v1x4.S diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..38770f66b --- /dev/null +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s4 +#define alpha0_I s5 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, 
z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, 
z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2w {z28.s, z29.s}, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + ld2w {z30.s, z31.s}, p1/z, [pCRow3] + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, 
z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lcgemm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lcgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lcgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lcgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lcgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lcgemm_kernel_L4_Mv1_22a + + .align 5 +.Lcgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L4_Mv1_22 + + .align 5 +.Lcgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + .align 5 +.Lcgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lcgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + +.Lcgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lcgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lcgemm_kernel_L4_Mv1_100 + + .align 5 +.Lcgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lcgemm_kernel_L4_Mv1_46 + +.Lcgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lcgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lcgemm_kernel_L4_Mv1_20 + + + +.Lcgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 4 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lcgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lcgemm_kernel_L999 + + tst counterJ , #2 + ble .Lcgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lcgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lcgemm_kernel_L2_Mv1_40 + .align 5 + +.Lcgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_22 + + +.Lcgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L2_Mv1_100 + +.Lcgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_42 + +.Lcgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lcgemm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L2_Mv1_20 + + +.Lcgemm_kernel_L2_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 4 * 2 + 
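+// Remaining odd column (N & 1) is handled below with the vector-by-1 micro-kernel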
+/******************************************************************************/ + +.Lcgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lcgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lcgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lcgemm_kernel_L1_Mv1_40 + .align 5 + +.Lcgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_22 + + +.Lcgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L1_Mv1_100 + +.Lcgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_42 + +.Lcgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lcgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L1_Mv1_20 + +.Lcgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lcgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/ctrmm_kernel_sve_v1x4.S b/kernel/arm64/ctrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..242968f63 --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s6 +#define alpha0_I s7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 
+ dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s 
+ ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 
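+// z16-z19 accumulate the real/imaginary partial sums for a vector-by-2 block of C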
+ dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lctrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lctrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, 
origPA // pA = start of A array + +.Lctrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lctrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lctrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lctrmm_kernel_L4_Mv1_22a + + .align 5 +.Lctrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L4_Mv1_22 + + .align 5 +.Lctrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lctrmm_kernel_L4_Mv1_44 + + .align 5 +.Lctrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lctrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lctrmm_kernel_L4_Mv1_44 + + +.Lctrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lctrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lctrmm_kernel_L4_Mv1_100 + + .align 5 +.Lctrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lctrmm_kernel_L4_Mv1_46 + +.Lctrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lctrmm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lctrmm_kernel_L4_Mv1_20 + + + +.Lctrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lctrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lctrmm_kernel_L999 + + tst counterJ , #2 + ble .Lctrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lctrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, 
#0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lctrmm_kernel_L2_Mv1_20: + + INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lctrmm_kernel_L2_Mv1_40 + .align 5 + +.Lctrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L2_Mv1_22 + + +.Lctrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lctrmm_kernel_L2_Mv1_100 + +.Lctrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L2_Mv1_42 + +.Lctrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L2_Mv1_20 + + +.Lctrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lctrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lctrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lctrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lctrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lctrmm_kernel_L1_Mv1_40 + .align 5 + +.Lctrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L1_Mv1_22 + + +.Lctrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lctrmm_kernel_L1_Mv1_100 + +.Lctrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L1_Mv1_42 + +.Lctrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L1_Mv1_20 + +.Lctrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 39ab2197048efca92d059f919987571cd92a903c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:12:22 +0100 Subject: [PATCH 31/77] sve copy functions for cgemm chemm zsymm --- kernel/arm64/cgemm_ncopy_sve_v1.c | 79 ++++++++++++++++ kernel/arm64/cgemm_tcopy_sve_v1.c | 75 +++++++++++++++ kernel/arm64/chemm_ltcopy_sve.c | 107 +++++++++++++++++++++ kernel/arm64/chemm_utcopy_sve.c | 108 +++++++++++++++++++++ kernel/arm64/zsymm_lcopy_sve.c | 150 ++++++++++++++++++++++++++++++ kernel/arm64/zsymm_ucopy_sve.c | 150 ++++++++++++++++++++++++++++++ param.h | 6 +- 7 files changed, 673 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/cgemm_ncopy_sve_v1.c create mode 100644 kernel/arm64/cgemm_tcopy_sve_v1.c create mode 100644 kernel/arm64/chemm_ltcopy_sve.c create mode 100644 kernel/arm64/chemm_utcopy_sve.c create mode 100644 kernel/arm64/zsymm_lcopy_sve.c create mode 100644 kernel/arm64/zsymm_ucopy_sve.c diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..6aa44a8f6 --- /dev/null +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint32_t lda_vec = svindex_s32(0, lda * 2); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec); + svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec); + svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active * 2; + } + aoffset += active * lda * 2; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..748cd954e --- /dev/null +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1); + svst2_f32(pg, (float *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += active * 2; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/chemm_ltcopy_sve.c b/kernel/arm64/chemm_ltcopy_sve.c new file mode 100644 index 000000000..40cf9ea31 --- /dev/null +++ b/kernel/arm64/chemm_ltcopy_sve.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/chemm_utcopy_sve.c b/kernel/arm64/chemm_utcopy_sve.c new file mode 100644 index 000000000..440acdb1b --- /dev/null +++ b/kernel/arm64/chemm_utcopy_sve.c @@ -0,0 +1,108 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c new file mode 100644 index 000000000..6f18aa956 --- /dev/null +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = 
svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c new file mode 100644 index 000000000..6be48cdaf --- /dev/null +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/param.h b/param.h index 8dd2a7461..5d46991a2 100644 --- a/param.h +++ b/param.h @@ -3325,11 +3325,13 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define DGEMM_DEFAULT_UNROLL_MN 32 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 32 -#define 
ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 32 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 From 0c91d043ae8d2dba0c7d3eeb2f63d17d9776c7e9 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 14:36:39 +0100 Subject: [PATCH 32/77] adapt CMake for SVE --- kernel/CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 9849ddc93..717c1ea72 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -323,35 +323,61 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #hemm - GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}HEMMUTCOPY_M) + set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}") + set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}") +endif() + GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) # symm for c and z +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M 
"${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) From f33543d029199ee1bf0786e16ff0610a6711c726 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 14:42:37 +0100 Subject: [PATCH 33/77] combine zchemm into single file --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/KERNEL.ARMV8SVE | 4 +- kernel/arm64/chemm_ltcopy_sve.c | 107 ------------------------------- kernel/arm64/chemm_utcopy_sve.c | 108 -------------------------------- kernel/arm64/zhemm_ltcopy_sve.c | 66 +++++++++++++++++++ kernel/arm64/zhemm_utcopy_sve.c | 65 +++++++++++++++++++ 6 files changed, 135 insertions(+), 219 deletions(-) delete mode 100644 kernel/arm64/chemm_ltcopy_sve.c delete mode 100644 kernel/arm64/chemm_utcopy_sve.c diff --git 
a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 76dda0c65..d74f0592d 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -174,8 +174,8 @@ CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -CHEMMLTCOPY_M = chemm_ltcopy_sve.c -CHEMMUTCOPY_M = chemm_utcopy_sve.c +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c CSYMMUCOPY_M = zsymm_ucopy_sve.c CSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 63dfde22f..66de642a5 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -174,8 +174,8 @@ CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -CHEMMLTCOPY_M = chemm_ltcopy_sve.c -CHEMMUTCOPY_M = chemm_utcopy_sve.c +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c CSYMMUCOPY_M = zsymm_ucopy_sve.c CSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/chemm_ltcopy_sve.c b/kernel/arm64/chemm_ltcopy_sve.c deleted file mode 100644 index 40cf9ea31..000000000 --- a/kernel/arm64/chemm_ltcopy_sve.c +++ /dev/null @@ -1,107 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -#include -#include "common.h" -#include - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - - int offset, i; - - lda *= 2; - - uint32_t sve_size = svcntw(); - svint32_t posY_vec = svdup_s32(posY); - svint32_t posX_vec = svdup_s32(posX); - svint32_t lda_vec = svdup_s32(lda); - svint32_t one_vec = svdup_s32(1); - - int32_t j = 0; - int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); - int32_t active = svcntp_b32(svptrue_b32(), pg); - svint32_t index_neg = svindex_s32(0, -1); - svint32_t index = svindex_s32(0, 1); - - do { - offset = posX - posY; - svint32_t vec_off = svdup_s32(offset); - svbool_t cmp = svcmpgt(pg, vec_off, index_neg); - - svint32_t temp = svadd_z(pg, posX_vec, index); - svint32_t temp1 = svmul_z(pg, temp, 2); - temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); - svint32_t temp2 = svmul_z(pg, temp, lda_vec); - temp2 = svmla_z(pg, temp2, posY_vec, 2); - svint32_t gat_ind = svsel(cmp, temp1, temp2); - - i = m; - while (i>0) { - svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); - svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); - - gat_ind = svadd_m(cmp, gat_ind, lda_vec); - gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); - if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); - data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); - } - - svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); - // dealing with ZERO separately - if (offset > -active && offset < 1) - b[ -2*offset + 1 ] = ZERO; - - b += active * 2; - offset --; - vec_off = svsub_z(pg, vec_off, one_vec); - cmp = svcmpgt(pg, vec_off, index_neg); - - i--; - } - - posX += sve_size; - posX_vec = svdup_s32(posX); - j += sve_size; - pg = svwhilelt_b32(j, N); - active = svcntp_b32(svptrue_b32(), pg); - } while (svptest_any(svptrue_b32(), pg)); - - return 0; -} diff --git a/kernel/arm64/chemm_utcopy_sve.c b/kernel/arm64/chemm_utcopy_sve.c deleted file mode 100644 index 440acdb1b..000000000 --- a/kernel/arm64/chemm_utcopy_sve.c +++ /dev/null @@ -1,108 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - - int offset, i; - - lda *= 2; - - uint32_t sve_size = svcntw(); - svint32_t posY_vec = svdup_s32(posY); - svint32_t posX_vec = svdup_s32(posX); - svint32_t lda_vec = svdup_s32(lda); - svint32_t one_vec = svdup_s32(1); - - int32_t j = 0; - int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); - int32_t active = svcntp_b32(svptrue_b32(), pg); - svint32_t index_neg = svindex_s32(0, -1); - svint32_t index = svindex_s32(0, 1); - - do { - offset = posX - posY; - svint32_t vec_off = svdup_s32(offset); - svbool_t cmp = svcmpgt(pg, vec_off, index_neg); - - svint32_t temp = svadd_z(pg, posX_vec, index); - svint32_t temp1 = svmul_z(pg, temp, lda); - temp1 = svmla_z(pg, temp1, posY_vec, 2); - svint32_t temp2 = svmul_z(pg, temp, 2); - temp2 = svmla_z(pg, temp2, posY_vec, lda); - svint32_t gat_ind = svsel(cmp, temp1, temp2); - - i = m; - while (i>0) { - svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); - svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); - - gat_ind = svadd_m(cmp, gat_ind, 2); - gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); - data_vec_imag = svneg_z(pg, data_vec_imag); - if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); - data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); - } - - svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); - // dealing with ZERO separately - if (offset > -active && offset < 1) - b[ -2*offset + 1 ] = ZERO; - - b += active * 2; - offset --; - vec_off = svsub_z(pg, vec_off, one_vec); - cmp = svcmpgt(pg, vec_off, index_neg); - - i--; - } - - posX += sve_size; - posX_vec = svdup_s32(posX); - j += sve_size; - pg = svwhilelt_b32(j, N); - active = svcntp_b32(svptrue_b32(), pg); - } while (svptest_any(svptrue_b32(), pg)); - - return 0; -} diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c index 58e9ff589..37dbfe4e1 100644 --- a/kernel/arm64/zhemm_ltcopy_sve.c +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -42,6 +42,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ +#if defined(DOUBLE) BLASLONG offset, i; lda *= 2; @@ -102,5 +103,70 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); +#else + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = 
svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + return 0; } diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index 9ddbf6cbd..21e03b7be 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -42,6 +42,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ +#if defined(DOUBLE) BLASLONG offset, i; lda *= 2; @@ -102,6 +103,70 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON pg = svwhilelt_b64(j, n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); +#else + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b 
+= active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif return 0; } From bb33446b409a388b05d918dd251efd4b445e6f47 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 6 Jan 2022 10:26:11 +0100 Subject: [PATCH 34/77] fix makefile.L3 --- kernel/Makefile.L3 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1c0931d96..2a10ac980 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1712,10 +1712,10 @@ $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif @@ -1726,10 +1726,10 @@ $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif @@ -1740,10 +1740,10 @@ $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif From cbcea149f0ed0bf966dafb5bd5b6612945b54858 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 6 Jan 2022 10:29:35 +0100 Subject: [PATCH 35/77] update contributors --- CONTRIBUTORS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 39ec96246..879aaebe3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -201,3 +201,5 @@ In chronological order: * 
Bine Brank * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM + * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions + * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions From 19c8f615dc507e20aee724aedb572bdddc2cd497 Mon Sep 17 00:00:00 2001 From: Sunita Nadampalli Date: Fri, 7 Jan 2022 00:28:17 +0000 Subject: [PATCH 36/77] OpenBLAS: aarch64: Add neoverse-v1/n2 architecture specifics --- Makefile.arm64 | 60 +++++++++++ Makefile.system | 3 + TargetList.txt | 2 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 62 ++++++++++- cpuid_arm64.c | 44 +++++++- driver/others/dynamic_arm64.c | 2 + getarch.c | 37 ++++++- kernel/arm64/KERNEL.NEOVERSEN2 | 189 +++++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.NEOVERSEV1 | 189 +++++++++++++++++++++++++++++++++ param.h | 58 ++++++++++ 11 files changed, 641 insertions(+), 7 deletions(-) create mode 100644 kernel/arm64/KERNEL.NEOVERSEN2 create mode 100644 kernel/arm64/KERNEL.NEOVERSEV1 diff --git a/Makefile.arm64 b/Makefile.arm64 index 801601030..2eade8d78 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -78,6 +78,66 @@ endif endif endif +# Use a72 tunings because Neoverse-V1 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEV1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +endif +else +CCOMMON_OPT += -march=armv8.4-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + +# Use a72 tunings because Neoverse-N2 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEN2) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +endif +else +CCOMMON_OPT += -march=armv8.5-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) diff --git a/Makefile.system b/Makefile.system index 97fdc3f91..9203f49cb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -374,6 +374,7 @@ else endif GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) 
$(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif @@ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +DYNAMIC_CORE += NEOVERSEV1 +DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX diff --git a/TargetList.txt b/TargetList.txt index b02a011d5..97c8a8f06 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -93,6 +93,8 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +NEOVERSEV1 +NEOVERSEN2 CORTEXA55 EMAG8180 FALKOR diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d468eb60b..f4a135e82 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 259d9c738..5f12bb145 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -243,11 +243,11 @@ endif () "#define L1_CODE_ASSOCIATIVE\t4\n" "#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_LINESIZE\t64\n" - "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" "#define L2_SIZE\t1048576\n\n" "#define L2_LINESIZE\t64\n" - "#define L2_ASSOCIATIVE\t16\n" - "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" "#define DTB_SIZE\t4096\n" "#define HAVE_VFPV4\n" "#define HAVE_VFPV3\n" @@ -263,6 +263,62 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEV1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEN2") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "FALKOR") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" diff --git a/cpuid_arm64.c 
b/cpuid_arm64.c index 958e94abc..cc3a82815 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -43,6 +43,8 @@ size_t length64=sizeof(value64); #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 #define CPU_NEOVERSEN1 11 +#define CPU_NEOVERSEV1 16 +#define CPU_NEOVERSEN2 17 // Qualcomm #define CPU_FALKOR 6 // Cavium @@ -71,6 +73,8 @@ static char *cpuname[] = { "TSV110", "EMAG8180", "NEOVERSEN1", + "NEOVERSEV1" + "NEOVERSEN2" "THUNDERX3T110", "VORTEX", "CORTEXA55", @@ -90,6 +94,8 @@ static char *cpuname_lower[] = { "tsv110", "emag8180", "neoversen1", + "neoversev1", + "neoversen2", "thunderx3t110", "vortex", "cortexa55", @@ -170,6 +176,10 @@ int detect(void) return CPU_CORTEXA73; else if (strstr(cpu_part, "0xd0c")) return CPU_NEOVERSEN1; + else if (strstr(cpu_part, "0xd40")) + return CPU_NEOVERSEV1; + else if (strstr(cpu_part, "0xd49")) + return CPU_NEOVERSEN2; else if (strstr(cpu_part, "0xd05")) return CPU_CORTEXA55; } @@ -338,11 +348,41 @@ void get_cpuconfig(void) printf("#define L1_DATA_ASSOCIATIVE 4\n"); printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_NEOVERSEV1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_NEOVERSEN2: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + case CPU_FALKOR: printf("#define FALKOR\n"); printf("#define L1_CODE_SIZE 65536\n"); diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 04ceaaf6d..45ea9f113 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -147,6 +147,8 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "neoversev1", + "neoversen2", "thunderx3t110", "cortexa55", "unknown" diff --git a/getarch.c b/getarch.c index 6063a2a1d..73bbf1892 100644 --- a/getarch.c +++ b/getarch.c @@ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ - "-march=armv8.2-a -mtune=cortex-a72" + "-march=armv8.2-a -mtune=neoverse-n1" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" #else #endif +#ifdef FORCE_NEOVERSEV1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEV1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEV1 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.4-a -mtune=neoverse-v1" +#define LIBNAME "neoversev1" +#define CORENAME "NEOVERSEV1" +#else +#endif + + +#ifdef FORCE_NEOVERSEN2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEN2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEN2 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.5-a -mtune=neoverse-n2" +#define LIBNAME "neoversen2" +#define CORENAME "NEOVERSEN2" +#else +#endif + #ifdef FORCE_CORTEXA55 #define FORCE #define ARCHITECTURE "ARM64" diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = 
gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = 
zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + 
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/param.h b/param.h index 8dd2a7461..eb4dcb8f0 100644 --- a/param.h +++ b/param.h @@ -3307,6 +3307,64 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(NEOVERSEV1) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + 
+#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#elif defined(NEOVERSEN2) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(ARMV8SVE) || defined(A64FX) /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". From 15d4b379138b9a5b84a2fbc2d37cb47b33efdeec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:48:13 +0100 Subject: [PATCH 37/77] SkylakeX: match parameters to dgemm kernels for dyn/non-dyn --- param.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/param.h b/param.h index 2dffaae3c..4155131f0 100644 --- a/param.h +++ b/param.h @@ -1669,10 +1669,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_UNROLL_M 16 -#ifndef DYNAMIC_ARCH -#define DGEMM_DEFAULT_UNROLL_M 16 -#else +#ifdef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_M 4 +#else +#define DGEMM_DEFAULT_UNROLL_M 16 #endif #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -1680,10 +1680,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#ifndef DYNAMIC_ARCH -#define DGEMM_DEFAULT_UNROLL_N 2 -#else +#ifdef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_N 8 +#else +#define DGEMM_DEFAULT_UNROLL_N 2 #endif #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 @@ -1718,17 +1718,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define SGEMM_DEFAULT_P 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_P 192 +#else +#define DGEMM_DEFAULT_P 384 +#endif #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_Q 384 +#else +#define DGEMM_DEFAULT_Q 168 +#endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_R 8640 +#else +#define DGEMM_DEFAULT_R 13824 +#endif #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r From f1ac59f20057cefe4dd45122954e2403f1330835 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:48:58 +0100 Subject: [PATCH 38/77] Forward DYNAMIC_ARCH option to Makefile.prebuild --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 97fdc3f91..7909f677a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,7 +277,7 @@ HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf From 2573ccfb2e02abec3f537479d65b58c4d6e746f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:50:34 +0100 Subject: [PATCH 39/77] make DYNAMIC_ARCH option available to getarch_2nd/param.h --- Makefile.prebuild | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.prebuild b/Makefile.prebuild index d6395da7b..399db956f 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -3,6 +3,10 @@ export BINARY export USE_OPENMP +ifdef DYNAMIC_ARCH +override HOST_CFLAGS += -DDYNAMIC_ARCH +endif + ifdef TARGET_CORE TARGET_MAKE = Makefile_kernel.conf TARGET_CONF = config_kernel.h From be7e55880c91d626a667aff699447c3ba5ab280e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 9 Jan 2022 19:40:04 +0100 Subject: [PATCH 40/77] sve trsm_kernel_LN --- kernel/arm64/trsm_kernel_LN_sve.c | 301 ++++++++++++++++++++++++++++++ 1 file changed, 301 insertions(+) create mode 100644 kernel/arm64/trsm_kernel_LN_sve.c diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c new file mode 100644 index 000000000..8ca10036b --- /dev/null +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + 
BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + i = sve_size; + if (i <= m) { + aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; + cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + sve_size * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + i = sve_size; + if (i <= m) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * j * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} From 098672b51b0c3a903be4be951ff60741cba43664 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 9 Jan 2022 20:11:47 +0100 Subject: [PATCH 41/77] add trsm_kernel_LT_sve --- kernel/arm64/trsm_kernel_LN_sve.c | 21 ++- kernel/arm64/trsm_kernel_LT_sve.c | 290 ++++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+), 4 deletions(-) create mode 100644 kernel/arm64/trsm_kernel_LT_sve.c diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index 8ca10036b..c29c3b57a 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -47,9 +47,22 @@ static FLOAT dm1 = -1.; #define GEMM_KERNEL GEMM_KERNEL_N #endif -#if GEMM_DEFAULT_UNROLL_M == 16 -#define GEMM_UNROLL_M_SHIFT 4 +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 #endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define 
GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif @@ -262,8 +275,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = sve_size; if (i <= m) { - aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; - cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; + cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; do { if (k - kk > 0) { diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c new file mode 100644 index 000000000..a35696836 --- /dev/null +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -0,0 +1,290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 
0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = sve_size % m; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} From a9e297e4764faa53b146de1b0c3ed82e2632e42c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jan 2022 23:31:59 +0100 Subject: [PATCH 42/77] Fix handling of ifdef/ifndef --- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index c5ee65384..56c1cb060 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -125,7 +125,7 @@ macro(ParseMakefileVars MAKEFILE_IN) if (NOT "${line_match}" STREQUAL "") #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") set (ElseSeen 0) - if (DEFINED ${CMAKE_MATCH_2}) + if (${CMAKE_MATCH_2}) if (${CMAKE_MATCH_1} STREQUAL "ifdef") #message (STATUS "condition is true") set (IfElse 1) From e8939b3d30e090b162303fcfbec2e7479a98ca6c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 10 Jan 2022 20:42:20 +0100 Subject: [PATCH 43/77] sve trsmRN and trsmRT --- kernel/arm64/trsm_kernel_LT_sve.c | 1 + kernel/arm64/trsm_kernel_RN_sve.c | 289 +++++++++++++++++++++++++++ kernel/arm64/trsm_kernel_RT_sve.c | 313 ++++++++++++++++++++++++++++++ 3 files changed, 603 insertions(+) create mode 100644 kernel/arm64/trsm_kernel_RN_sve.c create mode 100644 kernel/arm64/trsm_kernel_RT_sve.c diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index a35696836..7f5459702 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -37,6 +37,7 @@ /*********************************************************************/ #include "common.h" +#include "arm_sve.h" static FLOAT dm1 = -1.; diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c new file mode 100644 index 000000000..2f6611c1c --- /dev/null +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -0,0 +1,289 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = sve_size; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, 
+#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c new file mode 100644 index 000000000..d93ebe7ad --- /dev/null +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -0,0 +1,313 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - j) * sve_size * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; 
+ if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + From f87468ac916c7a64a9d8256bb6b81a36245f3bae Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 10 Jan 2022 21:45:37 +0100 Subject: [PATCH 44/77] trsm_lncopy_sve --- kernel/arm64/trsm_lncopy_sve.c | 114 +++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 kernel/arm64/trsm_lncopy_sve.c diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c new file mode 100644 index 000000000..d96a1f383 --- /dev/null +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -0,0 +1,114 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT *ao; + + jj = offset; + int js = 0; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(a + k * lda + j); + } + *(b + j * n_active + j) = INV(*(a + j * lda + j)); + } + } + + if (ii > jj) { + for (int j = 0; j < n_active; j++) { + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); + svst1(pn, b, aj_vec); + ao++; + } + + } + + b += n_active * n_active; + + i += n_active; + ii += n_active; + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} From 8071e179f1ba0c65da0841cc533d0f8d6b15c6ef Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 11 Jan 2022 21:16:38 +0100 Subject: [PATCH 45/77] add remaining sve trsm copy kernels --- kernel/arm64/trsm_ltcopy_sve.c | 114 +++++++++++++++++++++++++++++++++ kernel/arm64/trsm_uncopy_sve.c | 113 ++++++++++++++++++++++++++++++++ kernel/arm64/trsm_utcopy_sve.c | 114 +++++++++++++++++++++++++++++++++ 3 files changed, 341 insertions(+) create mode 100644 kernel/arm64/trsm_ltcopy_sve.c create mode 100644 kernel/arm64/trsm_uncopy_sve.c create mode 100644 kernel/arm64/trsm_utcopy_sve.c diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c new file mode 100644 index 000000000..9012f7fe5 --- /dev/null +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -0,0 +1,114 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT *ao; + + jj = offset; + int js = 0; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(a + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(a + j * lda + k); + } + } + } + + if (ii < jj) { + for (int j = 0; j < n_active; j++) { + svfloat64_t aj_vec = svld1(pn, ao); + svst1(pn, b, aj_vec); + ao += lda; + } + + } + + b += n_active * n_active; + + i += n_active; + ii += n_active; + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c new file mode 100644 index 000000000..242e99f60 --- /dev/null +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -0,0 +1,113 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT *ao; + + jj = offset; + int js = 0; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(a + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(a + k * lda + j); + } + } + } + + if (ii < jj) { + for (int j = 0; j < n_active; j++) { + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); + svst1(pn, b, aj_vec); + ao++; + } + } + + b += n_active * n_active; + + i += n_active; + ii += n_active; + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c new file mode 100644 index 000000000..9eefb8c18 --- /dev/null +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -0,0 +1,114 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT *ao; + + jj = offset; + int js = 0; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(a + j * lda + k); + } + *(b + j * n_active + j) = INV(*(a + j * lda + j)); + } + } + + if (ii > jj) { + for (int j = 0; j < n_active; j++) { + svfloat64_t aj_vec = svld1(pn, ao); + svst1(pn, b, aj_vec); + ao += lda; + } + + } + + b += n_active * n_active; + + i += n_active; + ii += n_active; + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} From aaa2b1a861623eb012288c2b401fa923933da55c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 15 Jan 2022 21:02:14 +0100 Subject: [PATCH 46/77] fix sve dtrsm kernels --- kernel/arm64/trsm_kernel_LN_sve.c | 20 ++++++++++-------- kernel/arm64/trsm_kernel_LT_sve.c | 2 +- kernel/arm64/trsm_kernel_RT_sve.c | 12 +++++------ kernel/arm64/trsm_lncopy_sve.c | 30 
+++++++++++++-------------- kernel/arm64/trsm_ltcopy_sve.c | 32 ++++++++++++++--------------- kernel/arm64/trsm_uncopy_sve.c | 29 +++++++++++++------------- kernel/arm64/trsm_utcopy_sve.c | 34 +++++++++++++++---------------- 7 files changed, 79 insertions(+), 80 deletions(-) diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index c29c3b57a..57f79ac3a 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -182,8 +182,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = m % sve_size; if (i) { - aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; - cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, @@ -205,10 +205,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, } + int mod = i; i = sve_size; if (i <= m) { - aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; - cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; do { if (k - kk > 0) { @@ -217,7 +218,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, ZERO, #endif aa + sve_size * kk * COMPSIZE, - b + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } @@ -251,8 +252,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = m % sve_size; if (i) { - aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; - cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, @@ -273,10 +274,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, } + int mod = i; i = sve_size; if (i <= m) { - aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; - cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; do { if (k - kk > 0) { diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index 7f5459702..8c6a57a6d 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -257,7 +257,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i += sve_size; } - i = sve_size % m; + i = m % sve_size; if (i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c index d93ebe7ad..efafc9d11 100644 --- a/kernel/arm64/trsm_kernel_RT_sve.c +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -258,23 +258,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, if (i <= m) { do { if (k - kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif - aa + GEMM_UNROLL_M * kk * COMPSIZE, + aa + sve_size * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } - solve(GEMM_UNROLL_M, GEMM_UNROLL_N, - aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); - aa += GEMM_UNROLL_M * k * COMPSIZE; - cc += GEMM_UNROLL_M * COMPSIZE; + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; i += sve_size; } while (i <= m); } diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index d96a1f383..7f480dcad 100644 --- a/kernel/arm64/trsm_lncopy_sve.c 
+++ b/kernel/arm64/trsm_lncopy_sve.c @@ -48,17 +48,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE + int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t js = 0; svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); @@ -74,25 +75,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { for (int k = 0; k < j; k++) { - *(b + j * n_active + k) = *(a + k * lda + j); + *(b + j * n_active + k) = *(ao + k * lda + j); } - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - } - - if (ii > jj) { - for (int j = 0; j < n_active; j++) { + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); - ao++; } - + ao++; + b += n_active; + i++; + ii++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index 9012f7fe5..d7b2a4e8d 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -48,18 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda); + int64_t js = 0; svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda); + int32_t js = 0; svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -73,26 +72,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); for (int k = j+1; k < n_active; k++) { - *(b + j * n_active + k) = *(a + j * lda + k); + *(b + j * n_active + k) = *(ao + j * lda + k); } } - } - - if (ii < jj) { - for (int j = 0; j < n_active; j++) { + b += n_active * n_active; + ao += lda * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { svfloat64_t aj_vec = svld1(pn, ao); svst1(pn, b, aj_vec); - ao += lda; } - + ao += lda; + b += n_active; + i ++; + ii ++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index 242e99f60..b2851452b 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -48,17 +48,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE + int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t js = 0; svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); @@ -73,25 +74,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, 
FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); for (int k = j+1; k < n_active; k++) { - *(b + j * n_active + k) = *(a + k * lda + j); + *(b + j * n_active + k) = *(ao + k * lda + j); } } - } - - if (ii < jj) { - for (int j = 0; j < n_active; j++) { + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); - ao++; } + ao++; + b += n_active; + i++; + ii++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 9eefb8c18..558955801 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -48,18 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda); + int64_t js = 0; svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda); + int32_t js = 0; svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -74,25 +73,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { for (int k = 0; k < j; k++) { - *(b + j * n_active + k) = *(a + j * lda + k); + *(b + j * n_active + k) = *(ao + j * lda + k); } - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - } - - if (ii > jj) { - for (int j = 0; j < n_active; j++) { + ao += lda * n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { svfloat64_t aj_vec = svld1(pn, ao); svst1(pn, b, aj_vec); - ao += lda; } - - } - - b += n_active * n_active; - - i += n_active; - ii += n_active; + ao += lda; + b += n_active; + i ++; + ii ++; + } } while (i < m); From f1315288a8d9f4e06da7b7ccb9a37f04ded95c5f Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 15 Jan 2022 22:27:25 +0100 Subject: [PATCH 47/77] add sve ztrsm --- kernel/arm64/KERNEL.A64FX | 43 +++++++---- kernel/arm64/trsm_kernel_LN_sve.c | 4 + kernel/arm64/trsm_kernel_LT_sve.c | 4 + kernel/arm64/trsm_kernel_RN_sve.c | 4 + kernel/arm64/trsm_kernel_RT_sve.c | 4 + kernel/arm64/trsm_lncopy_sve.c | 9 ++- kernel/arm64/trsm_ltcopy_sve.c | 9 ++- kernel/arm64/trsm_uncopy_sve.c | 9 ++- kernel/arm64/trsm_utcopy_sve.c | 9 ++- kernel/arm64/ztrsm_lncopy_sve.c | 119 ++++++++++++++++++++++++++++++ kernel/arm64/ztrsm_ltcopy_sve.c | 115 +++++++++++++++++++++++++++++ kernel/arm64/ztrsm_uncopy_sve.c | 119 ++++++++++++++++++++++++++++++ kernel/arm64/ztrsm_utcopy_sve.c | 115 +++++++++++++++++++++++++++++ 13 files changed, 539 insertions(+), 24 deletions(-) create mode 100644 kernel/arm64/ztrsm_lncopy_sve.c create mode 100644 kernel/arm64/ztrsm_ltcopy_sve.c create mode 100644 kernel/arm64/ztrsm_uncopy_sve.c create mode 100644 kernel/arm64/ztrsm_utcopy_sve.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index d74f0592d..bd25f7cd8 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = 
../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index 57f79ac3a..fa1c6e984 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -167,7 +167,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index 8c6a57a6d..2cbb2aafb 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c index 2f6611c1c..5e4e8d9b1 100644 --- a/kernel/arm64/trsm_kernel_RN_sve.c +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c index efafc9d11..c376c0e33 100644 --- a/kernel/arm64/trsm_kernel_RT_sve.c +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -169,7 +169,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM RT KERNEL m = %3ld n = 
%3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index 7f480dcad..5a9d4194a 100644 --- a/kernel/arm64/trsm_lncopy_sve.c +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii > jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); } ao++; @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index d7b2a4e8d..ac4019e26 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii < jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); } ao += lda; @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index b2851452b..8fdcd0f4b 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii < jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); } ao++; @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = 
svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 558955801..0f5f0dccd 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii > jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); } ao += lda; @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c new file mode 100644 index 000000000..eb7cd0294 --- /dev/null +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c new file mode 100644 index 000000000..27cd1a941 --- /dev/null +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + } + b += n_active * n_active * 2; + ao += lda * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c new file mode 100644 index 000000000..92e086b75 --- /dev/null +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c new file mode 100644 index 000000000..d82a9d0c8 --- /dev/null +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} From 0fb6cc07bf9fdf0cbe7a7595e82379a0040d9e9a Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 16 Jan 2022 21:39:57 +0100 Subject: [PATCH 48/77] fix ztrsm lt/ut copy --- kernel/arm64/ztrsm_ltcopy_sve.c | 2 +- 
kernel/arm64/ztrsm_utcopy_sve.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c index 27cd1a941..34dbf8a30 100644 --- a/kernel/arm64/ztrsm_ltcopy_sve.c +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -77,7 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } b += n_active * n_active * 2; - ao += lda * n_active * 2; + ao += lda * n_active; i += n_active; ii += n_active; } else { diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c index d82a9d0c8..ccb942e1b 100644 --- a/kernel/arm64/ztrsm_utcopy_sve.c +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - ao += lda * n_active * 2; + ao += lda * n_active; b += n_active * n_active * 2; i += n_active; ii += n_active; From b6a445cfd88ab0bfa1687aeba7cc2d6705497f77 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 16 Jan 2022 21:40:56 +0100 Subject: [PATCH 49/77] adapt Makefile for SVE trsm --- kernel/Makefile.L3 | 128 +++++++++++++++++++++++++++++++++++++++++++++ param.h | 4 +- 2 files changed, 130 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2a10ac980..2d5740183 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -2391,29 +2391,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLT_M +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2439,29 +2471,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : 
generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLT_M +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2535,29 +2599,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ 
$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2583,29 +2679,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) 
-DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ diff --git a/param.h b/param.h index 5d46991a2..ab6eab6eb 100644 --- a/param.h +++ b/param.h @@ -3327,11 +3327,11 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_MN 32 +#define CGEMM_DEFAULT_UNROLL_MN 16 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_MN 32 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 From 1b49ef8dcf6b01aecef30f804654d0efc97bc37a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:05:33 +0100 Subject: [PATCH 50/77] Fix pivot index for negative increments --- lapack/laswp/generic/laswp_k_1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index 88648cf29..556889291 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -57,10 +57,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 0e9e9513067665ec0a505ed935c89752a60dbb81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:06:41 +0100 Subject: [PATCH 51/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 93b9a2c01..f76cd078f 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From eca2f50b48a9941e2f3d2cd75fb699ace070f9cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:07:33 +0100 Subject: [PATCH 52/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index 191a229a9..6520ed799 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -65,10 +65,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, 
BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From afa0cece5cbca7ce9c749b3101ac36b15518508e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:08:20 +0100 Subject: [PATCH 53/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_8.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index 947941839..a7bf06817 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -78,10 +78,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 3b6293f5a0e4371d81074ee0ebc19d173ca696ed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:09:14 +0100 Subject: [PATCH 54/77] Fix offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index d1204778a..42aaed528 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 57e2a72f40aaf008c48b7f0ec6e5216aee1499c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:10:21 +0100 Subject: [PATCH 55/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index c18ab4bee..1220870f8 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -60,10 +60,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 40003f8edb9e5c529c1c12589f9e1b53f9ac8f2d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:11:18 +0100 Subject: [PATCH 56/77] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 45e1bf01e..cc7e296e1 100644 --- a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -69,10 +69,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From f158d59087c518fa924023d62a00eac176678dae Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 17 Jan 2022 22:36:48 +0100 Subject: [PATCH 57/77] adapt CMake --- kernel/CMakeLists.txt | 56 ++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 717c1ea72..8aa6728d5 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ 
-381,23 +381,35 @@ endif () GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ZTRSMCOPYLN_M) + set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}") + set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}") + set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}") + set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false 
${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) @@ -491,23 +503,35 @@ endif () GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED TRSMCOPYLN_M) + set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}") + set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}") + set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}") + set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - 
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) From 19d435b1b3a5d0d5719189ba29b13e728a2bb41c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 18 Jan 2022 08:28:31 +0100 Subject: [PATCH 58/77] update armv8sve + contributors --- CONTRIBUTORS.md | 1 + kernel/arm64/KERNEL.ARMV8SVE | 47 ++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 879aaebe3..5378c79bf 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -203,3 +203,4 @@ In chronological order: * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions + * [2022-01-18] SVE kernels and copy functions for TRSM diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 66de642a5..bd25f7cd8 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S @@ -140,8 +151,8 @@ DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S DGEMMINCOPY = dgemm_ncopy_sve_v1.c DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = 
../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) From 00f44bfff74e7173a881c4d6849deb75b9dfbd6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Fri, 21 Jan 2022 13:27:17 +0100 Subject: [PATCH 59/77] cmake: Check if Fortran compiler is usable before enabling it. --- cmake/f_check.cmake | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 0f5d0e15d..14683ed21 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -20,19 +20,16 @@ # NEEDBUNDERSCORE # NEED2UNDERSCORES -if (NOT NO_LAPACK) - include(CheckLanguage) - check_language(Fortran) - if(CMAKE_Fortran_COMPILER) - enable_language(Fortran) - else() - message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") +include(CheckLanguage) +check_language(Fortran) +if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) +else() + if (NOT NO_LAPACK) + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + endif() set (NOFORTRAN 1) set (NO_LAPACK 1) - endif() -else() - include(CMakeForceCompiler) - CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif() if (NOT ONLY_CBLAS) From 1937b4e435cce48dbf8d7d124800e03e1ba5d30d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:27:38 +0100 Subject: [PATCH 60/77] Add Elbrus e2k architecture detection --- c_check | 7 +++++++ ctest.c | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/c_check b/c_check index 030f5e632..999f5a7a7 100644 --- a/c_check +++ b/c_check @@ -84,6 +84,7 @@ $os = Haiku if ($data =~ /OS_HAIKU/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); $architecture = power if ($data =~ /ARCH_POWER/); $architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); @@ -124,6 +125,11 @@ if ($architecture eq "zarch") { $binary = 64; } +if ($architecture eq "e2k") { + $defined = 1; + $binary = 64; +} + if ($architecture eq "alpha") { $defined = 1; $binary = 64; @@ -223,6 +229,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); $architecture = power if ($data =~ /ARCH_POWER/); $architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); diff --git a/ctest.c b/ctest.c index 2afd93f68..fc52b43a6 100644 --- a/ctest.c +++ b/ctest.c @@ -165,3 +165,7 @@ ARCH_LOONGARCH64 HAVE_C11 #endif +#if defined(__e2k__) +ARCH_E2K +#endif + From bc93f468ef98c7bb76bdcaf779e9dbe7231303b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:53:38 +0100 Subject: [PATCH 61/77] Add Elbrus E2000 architecture as generic x86_64 compatible --- Makefile.e2k | 1 + TargetList.txt | 4 ++++ common.h | 4 ++++ common_e2k.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ common_macro.h | 2 +- getarch.c | 11 +++++++++ 6 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 Makefile.e2k create mode 100644 common_e2k.h diff --git a/Makefile.e2k b/Makefile.e2k new file mode 100644 index 000000000..a5e50b1f0 --- /dev/null +++ b/Makefile.e2k @@ -0,0 +1 @@ +COPT = -Wall 
-O2 # -DGEMMTEST diff --git a/TargetList.txt b/TargetList.txt index 97c8a8f06..a5a07a661 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -115,3 +115,7 @@ C910V 11.LOONGARCH64: LOONGSON3R5 + +12. Elbrus E2000: +E2K + diff --git a/common.h b/common.h index ff5254a5c..00d1d0baf 100644 --- a/common.h +++ b/common.h @@ -474,6 +474,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_loongarch64.h" #endif +#ifdef ARCH_E2K +#include "common_e2k.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_e2k.h b/common_e2k.h new file mode 100644 index 000000000..0739c9473 --- /dev/null +++ b/common_e2k.h @@ -0,0 +1,64 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#ifndef COMMON_E2K +#define COMMON_E2K + +#ifdef ASSEMBLER +#error +#endif + +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define RMB + +#define INLINE __attribute__((__always_inline__)) inline + +static inline int blas_quickdivide(blasint x, blasint y) { + return x / y; +} + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#ifndef BUFFERSIZE +#define BUFFER_SIZE (32 << 20) +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif + +#define SEEK_ADDRESS + +#endif + diff --git a/common_macro.h b/common_macro.h index cf2a3fd88..9826f1809 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2611,7 +2611,7 @@ #ifndef ASSEMBLER #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ -|| defined(ARCH_LOONGARCH64) +|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/getarch.c b/getarch.c index 73bbf1892..00e544bc7 100644 --- a/getarch.c +++ b/getarch.c @@ -1536,6 +1536,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(FORCE_E2K) || defined(__e2k__) +#define FORCE +#define ARCHITECTURE "E2K" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "generic" +#define CORENAME "generic" +#endif + #ifndef FORCE #ifdef USER_TARGET From 898cf5faf3fa3eaa6566c45276f7c6ba08082318 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:55:10 +0100 Subject: [PATCH 62/77] Add Elbrus e2k architecture support --- kernel/Makefile.L3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2d5740183..bea6cb048 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -617,6 +617,10 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ +ifeq ($(ARCH), E2K) +USE_TRMM = 1 +endif + ifeq ($(BUILD_BFLOAT16), 1) From 3492bea60225d795deb4e1b507914482133fc6a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:57:28 +0100 Subject: [PATCH 63/77] Create Makefile --- kernel/e2k/Makefile | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/e2k/Makefile diff --git a/kernel/e2k/Makefile b/kernel/e2k/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/e2k/Makefile @@ -0,0 +1 @@ +clean :: From 299d4d70a371c9fed9792daeb80329fd7961f841 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:59:36 +0100 Subject: [PATCH 64/77] Add default KERNEL file for Elbrus E2K arch --- kernel/e2k/KERNEL | 149 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 kernel/e2k/KERNEL diff --git a/kernel/e2k/KERNEL b/kernel/e2k/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/e2k/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c 
+ +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c 
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + From 66a15e15a87a7e89d7341006edd013f3b2843468 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 19:02:57 +0100 Subject: [PATCH 65/77] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5378c79bf..7e23dec8b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -204,3 +204,6 @@ In chronological order: * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions * [2022-01-18] SVE kernels and copy functions for TRSM + +* Ilya Kurdyukov + * [2021-02-21] Add basic support for the Elbrus E2000 architecture From 5d24f3d2102270e0cdc00823a06a35c2993bc361 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 19:09:00 +0100 Subject: [PATCH 66/77] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7e23dec8b..92be1fe42 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -205,5 +205,5 @@ In chronological order: * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions * [2022-01-18] SVE kernels and copy functions for TRSM -* Ilya Kurdyukov +* Ilya Kurdyukov * [2021-02-21] Add basic support for the Elbrus E2000 architecture From addc2a7aaa46eb1501a7c9c153951051eb82442d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 19:56:32 +0100 Subject: [PATCH 67/77] Add proper defaults for IMIN/IMAX --- kernel/sparc/KERNEL | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL index 594fd05e5..a8c958bb4 100644 --- a/kernel/sparc/KERNEL +++ b/kernel/sparc/KERNEL @@ -39,11 +39,19 @@ IZAMINKERNEL = izamax.S endif ifndef ISMINKERNEL -ISMINKERNEL = iamax.S +ISMINKERNEL = imax.S endif ifndef IDMINKERNEL -IDMINKERNEL = iamax.S +IDMINKERNEL = imax.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = imax.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = imax.S endif ifndef SNRM2KERNEL From 7f0b11fbc189e95c8ee2fd249980962b9f5a1125 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 22:00:39 +0100 Subject: [PATCH 68/77] Exclude some complex drivers when NO_LAPACK is set --- driver/level2/Makefile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/driver/level2/Makefile b/driver/level2/Makefile index caecf4f97..9bef6e2a5 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -64,9 +64,9 @@ CBLASOBJS += \ chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \ chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \ chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \ - csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ - cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ - csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ + csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) \ + cspr2_U.$(SUFFIX) 
cspr2_L.$(SUFFIX) \ + csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \ ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \ ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \ @@ -92,6 +92,13 @@ CBLASOBJS += \ ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) +ifndef NO_LAPACK +CBLASOBJS += \ + cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ + cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \ + csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) +endif + ZBLASOBJS += \ zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \ zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \ From d2b5fbf80f02539243cca20b496b0358d2829420 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 22:02:08 +0100 Subject: [PATCH 69/77] Exclude some complex (LAPACK) functions when NO_LAPACK is set --- interface/CMakeLists.txt | 19 +++++++++++++++---- interface/Makefile | 8 ++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ccb5fce3f..0b2998237 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -28,14 +28,21 @@ set(BLAS1_MANGLED_SOURCES # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c - trsv.c trmv.c symv.c - syr.c syr2.c gbmv.c - sbmv.c spmv.c - spr.c spr2.c + trsv.c trmv.c + syr2.c gbmv.c + sbmv.c + spr2.c tbsv.c tbmv.c tpsv.c tpmv.c ) +set(BLAS2_REAL_ONLY_SOURCES + symv.c syr.c spmv.c spr.c +) +set(BLAS2_COMPLEX_LAPACK_SOURCES + symv.c syr.c spmv.c spr.c +) + set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES hemv.c hbmv.c her.c her2.c @@ -78,6 +85,10 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) + if (NOT DEFINED NO_LAPACK) + GenerateNamedObjects("${BLAS2_COMPLEX_LAPACK_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + endif () GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) diff --git a/interface/Makefile b/interface/Makefile index 3252601d2..f57d0bda0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1016,11 +1016,13 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1034,11 +1036,13 @@ dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o 
$(@F) @@ -1106,11 +1110,13 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1124,11 +1130,13 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) From a3eea3e127fb9f3682e1e132c75b515d7b7d5241 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Feb 2022 11:43:17 +0100 Subject: [PATCH 70/77] Fix input argument check (LAPACK PR 646) --- lapack-netlib/SRC/cgeqrt2.f | 11 ++++------- lapack-netlib/SRC/dgeqrt2.f | 11 ++++------- lapack-netlib/SRC/sgeqrt2.f | 11 ++++------- lapack-netlib/SRC/zgeqrt2.f | 11 ++++------- 4 files changed, 16 insertions(+), 28 deletions(-) diff --git a/lapack-netlib/SRC/cgeqrt2.f b/lapack-netlib/SRC/cgeqrt2.f index 9ee3e4f79..11221636d 100644 --- a/lapack-netlib/SRC/cgeqrt2.f +++ b/lapack-netlib/SRC/cgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complexGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE CGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/dgeqrt2.f b/lapack-netlib/SRC/dgeqrt2.f index 138dd4d9c..00f800d43 100644 --- a/lapack-netlib/SRC/dgeqrt2.f +++ b/lapack-netlib/SRC/dgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup doubleGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE DGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/sgeqrt2.f b/lapack-netlib/SRC/sgeqrt2.f index 349fd4b60..f6532f812 100644 --- a/lapack-netlib/SRC/sgeqrt2.f +++ b/lapack-netlib/SRC/sgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date December 2016 -* *> \ingroup realGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE SGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/zgeqrt2.f b/lapack-netlib/SRC/zgeqrt2.f index bad708498..34d9d544f 100644 --- a/lapack-netlib/SRC/zgeqrt2.f +++ b/lapack-netlib/SRC/zgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complex16GEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE ZGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN From aec32e5bd4cdc6d69a04000ae9530983eec0e756 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Feb 2022 22:39:03 +0100 Subject: [PATCH 71/77] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 710940924..04ed428de 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -224,7 +224,7 @@ jobs: - job: OSX_IOS_ARMV8 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 From f7e8f9ec57dcbe7c3a94a18575f0379dfe828dae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Feb 2022 00:00:15 +0100 Subject: [PATCH 72/77] Support AVX512-enabled AlderLake --- cpuid_x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 6466bd148..d7d85eb20 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1495,6 +1495,10 @@ int get_cpuname(void){ switch (model) { case 7: // Alder Lake desktop case 10: // Alder Lake mobile + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) From fa3e9f25e633d5eb735e9183dfa72b6ed09fee0e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Feb 2022 00:00:56 +0100 Subject: [PATCH 73/77] Support AVX512-enabled Alder Lake --- driver/others/dynamic.c | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index b12fb069a..52a7c6087 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -708,8 +708,11 @@ static gotoblas_t *get_coretype(void){ case 9: if (model == 7 || model == 10) { // Alder Lake + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; } if(support_avx()) { From e2bf3f31a6e75223d864ffeb39c12bb3c68393e3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Feb 2022 22:09:25 +0100 Subject: [PATCH 74/77] Add .NOTPARALLEL: as a workaround for builds on DFS --- lapack-netlib/TESTING/MATGEN/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e21ebd6c3..0b94e3aaa 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -66,6 +66,7 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ endif .PHONY: all +.NOTPARALLEL: all: $(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ From 0e04710099df5dd9369d49d435488c6f3705691a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Feb 2022 23:03:05 +0100 Subject: [PATCH 75/77] filter out libflangmain as well --- f_check | 1 + 1 file changed, 1 insertion(+) diff --git a/f_check b/f_check index 4825fb09a..71293b53f 100644 --- a/f_check +++ b/f_check @@ -361,6 +361,7 @@ if ($link ne "") { ($flags =~ /^\-l/) && ($flags !~ /ibrary/) && ($flags !~ /gfortranbegin/) + && ($flags !~ /flangmain/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) && ($flags !~ /crt[0-9]/) From db7a03dd4c414c8053090bf5bcc18f0fc8e01095 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Feb 2022 23:04:45 +0100 Subject: [PATCH 76/77] keep flang-classic on MacOS from trying to create an executable instead of a library --- exports/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 903836dd6..baaa33623 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -142,10 +142,14 @@ ifneq (,$(filter 1 2,$(NOFORTRAN))) else ifeq ($(F_COMPILER), INTEL) $(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def +else +ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif endif +endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< From c352ac0ae3593a30262b20d54f95c19f517b56a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Feb 2022 22:16:04 +0100 Subject: [PATCH 77/77] Update with 0.3.20 changes --- Changelog.txt | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 180f7adec..97af4cbd9 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,39 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.20 + 20-Feb-2022 + +general: + - some code 
cleanup, with added casts etc. + - fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset + - fixed pivot index calculation by ?LASWP for negative increments other than one + - fixed input argument check in LAPACK ? GEQRT2 + - improved the check for a Fortran compiler in CMAKE builds + - disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1 + - fixed building of LAPACK on certain distributed filesystems with parallel gmake + - fixed building the shared library on MacOS with classic flang + +x86_64: + - fixed cross-compilation with CMAKE for CORE2 target + - fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds + - added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS + +E2K: + - add new architecture (Russian Elbrus E2000 family) + +SPARC: + - fix IMIN/IMAX + +ARMV8: + - added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX + - added support for Neoverse N2 and V1 cpus + +MIPS,MIPS64: + - fixed autodetection of MSA capability + +LOONGARCH64: + - added an optimized DGEMM kernel + ==================================================================== Version 0.3.19 19-Dec-2021
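
For reference, the Alder Lake patches above gate the COOPERLAKE and SKYLAKEX code paths on runtime checks such as support_avx512() and support_avx512_bf16(). The sketch below is a minimal, self-contained illustration of how that kind of CPUID-based feature detection works on x86-64 with GCC/Clang; the helper names are illustrative only (not the OpenBLAS internals), and a complete check, as done in cpuid_x86.c and dynamic.c, must additionally confirm OS support for the ZMM/opmask register state via XGETBV (XCR0 bits 1, 2, 5, 6, 7), which is omitted here for brevity.

    /* avx512_probe.c - illustrative sketch, x86-64 + GCC/Clang only.
     * Build: gcc -O2 avx512_probe.c -o avx512_probe
     * NOTE: does not perform the XGETBV/OS-state check that a real
     * dispatcher (e.g. OpenBLAS) also requires before using AVX512. */
    #include <cpuid.h>
    #include <stdio.h>

    static int cpu_has_avx512f(void) {
        unsigned int eax, ebx, ecx, edx;
        /* CPUID.(EAX=7,ECX=0):EBX bit 16 = AVX512F */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return 0;
        return (ebx >> 16) & 1;
    }

    static int cpu_has_avx512_bf16(void) {
        unsigned int eax, ebx, ecx, edx;
        /* CPUID.(EAX=7,ECX=1):EAX bit 5 = AVX512_BF16; CPUs without this
         * sub-leaf report zeroed registers, so the test is still safe. */
        if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx)) return 0;
        return (eax >> 5) & 1;
    }

    int main(void) {
        printf("AVX512F:     %d\n", cpu_has_avx512f());
        printf("AVX512_BF16: %d\n", cpu_has_avx512_bf16());
        return 0;
    }

On an Alder Lake system with AVX512 enabled in the BIOS this reports both features, which is what lets the dynamic dispatcher select the COOPERLAKE (BF16) or SKYLAKEX kernels instead of falling back to HASWELL.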