From e112191b549afa98062acc292ac62e1b766ae53d Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan
Date: Wed, 22 May 2024 08:00:06 -0500
Subject: [PATCH 1/9] POWER: Fix issues in zscal to address lapack failures

This patch fixes the following LAPACK failures with the clang compiler on POWER.
zed.out: ZVX: 18 out of 5190 tests failed to pass the threshold
zgd.out: ZGV drivers: 25 out of 1092 tests failed to pass the threshold
zgd.out: ZGV drivers: 6 out of 1092 tests failed to pass the threshold
---
 kernel/power/zscal.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
index 0068138e8..6b7392d0c 100644
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@@ -38,6 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #pragma GCC optimize "O1"
 
+#if defined(__clang__)
+#pragma clang fp contract(off)
+#endif
+
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #if defined(DOUBLE)

From 44004178aadd1a55a273b4261a345b24ad77ac2e Mon Sep 17 00:00:00 2001
From: Jake Arkinstall
Date: Sat, 1 Jun 2024 11:22:26 +0100
Subject: [PATCH 2/9] Updated CONTRIBUTORS.md

As requested on X (https://x.com/KroekerMartin/status/1755218919290278185)
---
 CONTRIBUTORS.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 203320826..d885a01b9 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -198,6 +198,9 @@ In chronological order:
 * PingTouGe Semiconductor Co., Ltd.
   * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
 
+* Jake Arkinstall
+  * [2021-02-10] Remove in-source configure_file to enable builds in read-only contexts (issue #3100, PR #3101)
+
 * River Dillon
   * [2021-07-10] fix compilation with musl libc
 

From db9f7bc5526e70a601c374709a24424c21ab91a9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 3 Jun 2024 00:22:16 +0200
Subject: [PATCH 3/9] fix float array types to include bfloat16

---
 interface/gemm_batch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/gemm_batch.c b/interface/gemm_batch.c
index 846d8e0f4..56ccc12ce 100644
--- a/interface/gemm_batch.c
+++ b/interface/gemm_batch.c
@@ -118,8 +118,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CB
 	   blasint * m_array, blasint * n_array, blasint * k_array,
 #ifndef COMPLEX
 	   FLOAT * alpha_array,
-	   FLOAT ** a_array, blasint * lda_array,
-	   FLOAT ** b_array, blasint * ldb_array,
+	   IFLOAT ** a_array, blasint * lda_array,
+	   IFLOAT ** b_array, blasint * ldb_array,
 	   FLOAT * beta_array,
 	   FLOAT ** c_array, blasint * ldc_array, blasint group_count, blasint * group_size) {
 #else

From df87aeb5a2a3785e15a3d94dbf92e2e03448500f Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 4 Jun 2024 09:49:18 +0200
Subject: [PATCH 4/9] Drop the -static Fortran flag from generic builds as it breaks OpenMP

---
 Makefile.riscv64 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.riscv64 b/Makefile.riscv64
index 113cc57c5..9f6e48b7a 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -8,13 +8,13 @@ FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_ZVL256B)
 CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
-FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_ZVL128B)
 CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
-FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
-FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
+FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 endif

From 8ab2e9ec65c8c182a7189cf5574e4f2f51e02e5c Mon Sep 17 00:00:00 2001
From: gxw
Date: Sat, 16 Sep 2023 11:19:12 +0800
Subject: [PATCH 5/9] LoongArch: DGEMM small matrix opt

---
 Makefile.system                              |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5        |  10 +
 .../loongarch64/dgemm_small_kernel_nn_lasx.S | 549 +++++++++++++++
 .../loongarch64/dgemm_small_kernel_nt_lasx.S | 500 ++++++++++++++
 .../loongarch64/dgemm_small_kernel_tn_lasx.S | 639 ++++++++++++++++++
 .../loongarch64/dgemm_small_kernel_tt_lasx.S | 534 +++++++++++++++
 .../loongarch64/dgemm_small_matrix_permit.c  |  44 ++
 7 files changed, 2279 insertions(+)
 create mode 100644 kernel/loongarch64/dgemm_small_kernel_nn_lasx.S
 create mode 100644 kernel/loongarch64/dgemm_small_kernel_nt_lasx.S
 create mode 100644 kernel/loongarch64/dgemm_small_kernel_tn_lasx.S
 create mode 100644 kernel/loongarch64/dgemm_small_kernel_tt_lasx.S
 create mode 100644 kernel/loongarch64/dgemm_small_matrix_permit.c

diff --git a/Makefile.system b/Makefile.system
index f452011ad..4cd4e4a1c 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -269,6 +269,9 @@ else ifeq ($(ARCH), power)
 SMALL_MATRIX_OPT = 1
 BUILD_BFLOAT16 = 1
 endif
+ifeq ($(ARCH), loongarch64)
+SMALL_MATRIX_OPT = 1
+endif
 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
 endif

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 2c1ab87e5..eff1581d9 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -162,4 +162,14 @@ STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DGEMM_SMALL_M_PERMIT = dgemm_small_matrix_permit.c
+DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_lasx.S
+DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_lasx.S
+DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_lasx.S
+DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_lasx.S
+DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_lasx.S
+DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_lasx.S
+DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_lasx.S
+DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_lasx.S
 endif

diff --git a/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S b/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S
new file mode 100644
index 000000000..a50350ddc
--- /dev/null
+++ b/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S
@@ -0,0 +1,549 @@
+/***************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 +#ifdef B0 +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M16 $t1 +#define M8 $t1 +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 +#define N2 $t2 +#define N1 $t2 +#define K8 $t3 +#define A0 $t4 +#define X0 $t5 +#define B1 $t6 +#define B2 $t7 +#define B3 $t8 +#define C0 $s0 +#define C1 $s1 +#define C2 $s2 +#define C3 $s3 +#define K1 $s4 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define D4 $xr6 +#define D5 $xr7 +#define D6 $xr8 +#define D7 $xr9 +#define D8 $xr10 +#define D9 $xr11 +#define D10 $xr12 +#define D11 $xr13 +#define D12 $xr14 +#define D13 $xr15 +#define D14 $xr16 +#define D15 $xr17 +#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 +#define V1 $vr3 +#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + +.macro DGEMM_SMALL_KERNEL_NN_TAIL M + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M\M\()_N3 +.L_M\M\()_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M\M\()_N4_END +.L_M\M\()_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1, D2, S0, Z2, D2, D3, S0, Z3, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N4_K1 +.L_M\M\()_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 + GLD xv, , S0, C2, 0x00 + GMADD xvf, d, D2, S0, VBETA, D2 + GLD xv, , S0, C3, 0x00 + GMADD xvf, d, D3, S0, VBETA, D3 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00, D2, C2, 0x00, D3, C3, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00, V2, C2, 0x00, V3, C3, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00, F2, C2, 0x00, F3, C3, 0x00 +.endif + // Update C0, C1, C2, C3 + PTR_ALSL 
C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M\M\()_N4 +.L_M\M\()_N3: + andi N2, N, 0x02 + beqz N2, .L_M\M\()_N1 +.L_M\M\()_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N2_END +.L_M\M\()_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N2_K1 +.L_M\M\()_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00 +.endif + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M\M\()_N1: + andi N1, N, 0x01 + beqz N1, .L_M\M\()_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N1_END +.L_M\M\()_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N1_K1 +.L_M\M\()_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00 +.endif +.L_M\M\()_END: +.if \M == 4 + PTR_ADDI A, A, 0x20 + PTR_ADDI C, C, 0x20 +.elseif \M == 2 + PTR_ADDI A, A, 0x10 + PTR_ADDI C, C, 0x10 +.elseif \M == 1 +.endif +.endm + + PROLOGUE + PTR_LD LDC, $sp, 0 + push_if_used 5, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, VBETA +#endif + PTR_SLLI LDA, LDA, 3 + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_SLLI K8, K, 3 + PTR_SRAI M16, M, 4 // M >> 4 + beqz M16, .L_M15 +.L_M16: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M16_N3 +.L_M16_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7, \ + D8, D8, D8, D9, D9, D9, D10, D10, D10, D11, D11, D11, \ + D12, D12, D12, D13, D13, D13, D14, D14, D14, D15, D15, D15 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M16_N4_END +.L_M16_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7, \ + D8, S0, Z2, D8, D9, S1, Z2, D9, D10, S2, Z2, D10, D11, S3, Z2, D11, \ + D12, S0, Z3, D12, D13, S1, Z3, D13, D14, S2, Z3, D14, D15, S3, Z3, D15 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + 
PTR_ADDI B3, B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N4_K1 + .L_M16_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20, S2, C2, 0x40, S3, C2, 0x60 + GMADD xvf, d, D8, S0, VBETA, D8, D9, S1, VBETA, D9, D10, S2, VBETA, D10, D11, S3, VBETA, D11 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20, S2, C3, 0x40, S3, C3, 0x60 + GMADD xvf, d, D12, S0, VBETA, D12, D13, S1, VBETA, D13, D14, S2, VBETA, D14, D15, S3, VBETA, D15 +#endif + GST xv, , D12, C3, 0x00, D13, C3, 0x20, D14, C3, 0x40, D15, C3, 0x60, \ + D8, C2, 0x00, D9, C2, 0x20, D10, C2, 0x40, D11, C2, 0x60, \ + D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M16_N4 +.L_M16_N3: + andi N2, N, 0x02 + beqz N2, .L_M16_N1 +.L_M16_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N2_END +.L_M16_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N2_K1 +.L_M16_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 +#endif + GST xv, , D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M16_N1: + andi N1, N, 0x01 + beqz N1, .L_M16_END + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N1_END +.L_M16_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3 + 
PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N1_K1 +.L_M16_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0 + move A0, A +.L_M16_END: + PTR_ADDI M16, M16, -1 + PTR_ADDI A, A, 0x80 + PTR_ADDI C, C, 0x80 + bnez M16, .L_M16 +.L_M15: + andi M8, M, 0x08 + beqz M8, .L_M7 +.L_M8: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M8_N3 +.L_M8_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M8_N4_END +.L_M8_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3, \ + D4, S0, Z2, D4, D5, S1, Z2, D5, \ + D6, S0, Z3, D6, D7, S1, Z3, D7, + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N4_K1 +.L_M8_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20 + GMADD xvf, d, D6, S0, VBETA, D6, D7, S1, VBETA, D7 +#endif + GST xv, , D4, C2, 0x00, D5, C2, 0x20, D6, C3, 0x00, D7, C3, 0x20, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M8_N4 +.L_M8_N3: + andi N2, N, 0x02 + beqz N2, .L_M8_N1 +.L_M8_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N2_END +.L_M8_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N2_K1 +.L_M8_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1 + 
PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M8_N1: + andi N1, N, 0x01 + beqz N1, .L_M8_END + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N1_END +.L_M8_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N1_K1 +.L_M8_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +.L_M8_END: + PTR_ADDI A, A, 0x40 + PTR_ADDI C, C, 0x40 +.L_M7: + andi M4, M, 0x04 + beqz M4, .L_M3 +.L_M4: + DGEMM_SMALL_KERNEL_NN_TAIL 4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + DGEMM_SMALL_KERNEL_NN_TAIL 2 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + DGEMM_SMALL_KERNEL_NN_TAIL 1 +.L_M0: + pop_if_used 5, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_kernel_nt_lasx.S b/kernel/loongarch64/dgemm_small_kernel_nt_lasx.S new file mode 100644 index 000000000..aee0586f5 --- /dev/null +++ b/kernel/loongarch64/dgemm_small_kernel_nt_lasx.S @@ -0,0 +1,500 @@ +/*************************************************************************** +Copyright (c) 2024 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 +#ifdef B0 +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M16 $t1 +#define M8 $t1 +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 +#define N2 $t2 +#define N1 $t2 +#define K_LDB $t3 +#define A0 $t4 +#define X0 $t5 +#define C0 $t6 +#define C1 $t7 +#define C2 $t8 +#define C3 $s0 +#define K1 $s1 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define D4 $xr6 +#define D5 $xr7 +#define D6 $xr8 +#define D7 $xr9 +#define D8 $xr10 +#define D9 $xr11 +#define D10 $xr12 +#define D11 $xr13 +#define D12 $xr14 +#define D13 $xr15 +#define D14 $xr16 +#define D15 $xr17 +#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 +#define V1 $vr3 +#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + +.macro DGEMM_SMALL_KERNEL_NT_TAIL M + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M\M\()_N3 +.L_M\M\()_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M\M\()_N4_END +.L_M\M\()_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1, D2, S0, Z2, D2, D3, S0, Z3, D3 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N4_K1 +.L_M\M\()_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 + GLD xv, , S0, C2, 0x00 + GMADD xvf, d, D2, S0, VBETA, D2 + GLD xv, , S0, C3, 0x00 + GMADD xvf, d, D3, S0, VBETA, D3 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00, D2, C2, 0x00, D3, C3, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00, V2, C2, 0x00, V3, C3, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00, F2, C2, 0x00, F3, C3, 0x00 +.endif + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0 + move A0, A + bnez N4, .L_M\M\()_N4 +.L_M\M\()_N3: + andi N2, N, 0x02 + beqz N2, .L_M\M\()_N1 +.L_M\M\()_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N2_END +.L_M\M\()_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N2_K1 +.L_M\M\()_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 
0x00, F1, C1, 0x00 +.endif + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A +.L_M\M\()_N1: + andi N1, N, 0x01 + beqz N1, .L_M\M\()_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N1_END +.L_M\M\()_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N1_K1 +.L_M\M\()_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00 +.endif +.L_M\M\()_END: +.if \M == 4 + PTR_ADDI A, A, 0x20 + PTR_ADDI C, C, 0x20 +.elseif \M == 2 + PTR_ADDI A, A, 0x10 + PTR_ADDI C, C, 0x10 +.elseif \M == 1 +.endif +.endm + + PROLOGUE + PTR_LD LDC, $sp, 0 + push_if_used 2, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, VBETA +#endif + PTR_SLLI LDA, LDA, 3 + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_MUL K_LDB, K, LDB + PTR_SRAI M16, M, 4 // M >> 4 + beqz M16, .L_M15 +.L_M16: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M16_N3 +.L_M16_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7, \ + D8, D8, D8, D9, D9, D9, D10, D10, D10, D11, D11, D11, \ + D12, D12, D12, D13, D13, D13, D14, D14, D14, D15, D15, D15 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M16_N4_END +.L_M16_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7, \ + D8, S0, Z2, D8, D9, S1, Z2, D9, D10, S2, Z2, D10, D11, S3, Z2, D11, \ + D12, S0, Z3, D12, D13, S1, Z3, D13, D14, S2, Z3, D14, D15, S3, Z3, D15 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N4_K1 + .L_M16_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20, S2, C2, 0x40, S3, C2, 0x60 + GMADD xvf, d, D8, S0, VBETA, D8, D9, S1, VBETA, D9, D10, S2, VBETA, D10, D11, S3, VBETA, D11 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20, S2, C3, 0x40, S3, C3, 0x60 + GMADD xvf, d, D12, S0, VBETA, D12, D13, S1, VBETA, D13, D14, S2, VBETA, D14, D15, S3, VBETA, D15 +#endif + GST xv, , D12, C3, 0x00, D13, C3, 0x20, D14, C3, 0x40, D15, C3, 0x60, \ + D8, C2, 0x00, D9, C2, 0x20, D10, C2, 0x40, D11, C2, 0x60, \ + D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + 
PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0 + move A0, A + bnez N4, .L_M16_N4 +.L_M16_N3: + andi N2, N, 0x02 + beqz N2, .L_M16_N1 +.L_M16_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N2_END +.L_M16_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N2_K1 +.L_M16_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 +#endif + GST xv, , D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A +.L_M16_N1: + andi N1, N, 0x01 + beqz N1, .L_M16_END + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N1_END +.L_M16_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N1_K1 +.L_M16_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x08 + // Restore A0 + move A0, A +.L_M16_END: + PTR_ADDI M16, M16, -1 + PTR_ADDI A, A, 0x80 + PTR_ADDI C, C, 0x80 + bnez M16, .L_M16 +.L_M15: + andi M8, M, 0x08 + beqz M8, .L_M7 +.L_M8: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M8_N3 +.L_M8_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M8_N4_END +.L_M8_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3, \ + D4, S0, Z2, D4, D5, S1, Z2, D5, \ + D6, S0, Z3, D6, D7, S1, Z3, D7, + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N4_K1 +.L_M8_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, 
VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20 + GMADD xvf, d, D6, S0, VBETA, D6, D7, S1, VBETA, D7 +#endif + GST xv, , D4, C2, 0x00, D5, C2, 0x20, D6, C3, 0x00, D7, C3, 0x20, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0 + move A0, A + bnez N4, .L_M8_N4 +.L_M8_N3: + andi N2, N, 0x02 + beqz N2, .L_M8_N1 +.L_M8_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N2_END +.L_M8_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N2_K1 +.L_M8_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A +.L_M8_N1: + andi N1, N, 0x01 + beqz N1, .L_M8_END + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N1_END +.L_M8_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1 + PTR_ADD X0, X0, LDB + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N1_K1 +.L_M8_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +.L_M8_END: + PTR_ADDI A, A, 0x40 + PTR_ADDI C, C, 0x40 +.L_M7: + andi M4, M, 0x04 + beqz M4, .L_M3 +.L_M4: + DGEMM_SMALL_KERNEL_NT_TAIL 4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + DGEMM_SMALL_KERNEL_NT_TAIL 2 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + DGEMM_SMALL_KERNEL_NT_TAIL 1 +.L_M0: + pop_if_used 2, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_kernel_tn_lasx.S b/kernel/loongarch64/dgemm_small_kernel_tn_lasx.S new file mode 100644 index 000000000..b1e588606 --- /dev/null +++ b/kernel/loongarch64/dgemm_small_kernel_tn_lasx.S @@ -0,0 +1,639 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 +#ifdef B0 +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 +#define N2 $t2 +#define N1 $t2 +#define K8 $t3 +#define A0 $t4 +#define X0 $t5 +#define B1 $t6 +#define B2 $t7 +#define B3 $t8 +#define C0 $s0 +#define C1 $s1 +#define C2 $s2 +#define C3 $s3 +#define K1 $s4 +#define A1 $s5 +#define A2 $s6 +#define A3 $s7 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define T0 $xr6 +#define T1 $xr7 +#define T2 $xr8 +#define T3 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define Y2 $xr12 +#define Y3 $xr13 +#define G0 $xr14 +#define G1 $xr15 +#define G2 $xr16 +#define G3 $xr17 +#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 +#define V1 $vr3 +#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + + PROLOGUE + PTR_LD LDC, $sp, 0 + push_if_used 8, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, VBETA +#endif + PTR_SLLI LDA, LDA, 3 + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_SLLI K8, K, 3 + PTR_SRAI M4, M, 2 // M >> 2 + beqz M4, .L_M3 +.L_M4: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M4_N3 +.L_M4_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M4_N4_END + PTR_SRAI K1, K1, 3 + beq ZERO, K1, .L_M4_N4_K7 +.L_M4_N4_K8: + PTR_ADDI K1, K1, -1 + GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00 + GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 + + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GLDREPL xv, d, T0, X0, 
0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08 + GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10 + GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + GMADD xvf, d, D0, S1, T0, D0, \ + D1, S1, T1, D1, \ + D2, S1, T2, D2, \ + D3, S1, T3, D3 + GMADD xvf, d, D0, S2, Y0, D0, \ + D1, S2, Y1, D1, \ + D2, S2, Y2, D2, \ + D3, S2, Y3, D3 + GMADD xvf, d, D0, S3, G0, D0, \ + D1, S3, G1, D1, \ + D2, S3, G2, D2, \ + D3, S3, G3, D3 + + GLD xv, , T0, A0, 0x20, T1, A1, 0x20, T2, A2, 0x20, T3, A3, 0x20 + GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 + + GLDREPL xv, d, Z0, X0, 0x20, Z1, B1, 0x20, Z2, B2, 0x20, Z3, B3, 0x20 + GLDREPL xv, d, T0, X0, 0x28, T1, B1, 0x28, T2, B2, 0x28, T3, B3, 0x28 + GLDREPL xv, d, Y0, X0, 0x30, Y1, B1, 0x30, Y2, B2, 0x30, Y3, B3, 0x30 + GLDREPL xv, d, G0, X0, 0x38, G1, B1, 0x38, G2, B2, 0x38, G3, B3, 0x38 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + GMADD xvf, d, D0, S1, T0, D0, \ + D1, S1, T1, D1, \ + D2, S1, T2, D2, \ + D3, S1, T3, D3 + GMADD xvf, d, D0, S2, Y0, D0, \ + D1, S2, Y1, D1, \ + D2, S2, Y2, D2, \ + D3, S2, Y3, D3 + GMADD xvf, d, D0, S3, G0, D0, \ + D1, S3, G1, D1, \ + D2, S3, G2, D2, \ + D3, S3, G3, D3 + + PTR_ADDI X0, X0, 0x40 + PTR_ADDI B1, B1, 0x40 + PTR_ADDI B2, B2, 0x40 + PTR_ADDI B3, B3, 0x40 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI A1, A1, 0x40 + PTR_ADDI A2, A2, 0x40 + PTR_ADDI A3, A3, 0x40 + bnez K1, .L_M4_N4_K8 + .L_M4_N4_K7: + andi K1, K, 4 + beqz K1, .L_M4_N4_3 + .L_M4_N4_K4: + GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00 + GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 + + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GLDREPL xv, d, T0, X0, 0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08 + GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10 + GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + GMADD xvf, d, D0, S1, T0, D0, \ + D1, S1, T1, D1, \ + D2, S1, T2, D2, \ + D3, S1, T3, D3 + GMADD xvf, d, D0, S2, Y0, D0, \ + D1, S2, Y1, D1, \ + D2, S2, Y2, D2, \ + D3, S2, Y3, D3 + GMADD xvf, d, D0, S3, G0, D0, \ + D1, S3, G1, D1, \ + D2, S3, G2, D2, \ + D3, S3, G3, D3 + PTR_ADDI X0, X0, 0x20 + PTR_ADDI B1, B1, 0x20 + PTR_ADDI B2, B2, 0x20 + PTR_ADDI B3, B3, 0x20 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI A1, A1, 0x20 + PTR_ADDI A2, A2, 0x20 + PTR_ADDI A3, A3, 0x20 + .L_M4_N4_3: + andi K1, K, 3 + beqz K1, .L_M4_N4_END + .L_M4_N4_K1: + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADDI K1, K1, -1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N4_K1 + .L_M4_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, \ + D1, S1, VBETA, D1, \ + D2, S2, VBETA, D2, \ + D3, S3, VBETA, D3 +#endif + GST xv, , D3, C3, 0x00, \ + D2, C2, 0x00, \ + D1, C1, 0x00, \ + D0, C0, 0x00 + // Update C0, 
C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0, A1, A2, A3 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA + bnez N4, .L_M4_N4 +.L_M4_N3: + andi N2, N, 0x02 + beqz N2, .L_M4_N1 +.L_M4_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M4_N2_END +.L_M4_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N2_K1 +.L_M4_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D1, C1, 0x00, \ + D0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA +.L_M4_N1: + andi N1, N, 0x01 + beqz N1, .L_M4_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M4_N1_END +.L_M4_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N1_K1 +.L_M4_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST xv, , D0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA +.L_M4_END: + PTR_ADDI M4, M4, -1 + PTR_ALSL A, LDA, A, 2 // A += LDA << 2; + PTR_ADDI C, C, 0x20 + bnez M4, .L_M4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + PTR_ADD A1, A0, LDA + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M2_N3 +.L_M2_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M2_N4_END +.L_M2_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N4_K1 + .L_M2_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, 
C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST v, , V3, C3, 0x00, \ + V2, C2, 0x00, \ + V1, C1, 0x00, \ + V0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA + bnez N4, .L_M2_N4 +.L_M2_N3: + andi N2, N, 0x02 + beqz N2, .L_M2_N1 +.L_M2_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M2_N2_END +.L_M2_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N2_K1 +.L_M2_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST v, , V1, C1, 0x00, \ + V0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA +.L_M2_N1: + andi N1, N, 0x01 + beqz N1, .L_M2_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M2_N1_END +.L_M2_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N1_K1 +.L_M2_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST v, , V0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA +.L_M2_END: + PTR_ALSL A, LDA, A, 1 // A += LDA << 1; + PTR_ADDI C, C, 0x10 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M1_N3 +.L_M1_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M1_N4_END +.L_M1_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N4_K1 + .L_M1_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST f, d, F3, C3, 0x00, \ + F2, C2, 0x00, \ + F1, C1, 0x00, \ + F0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, 
C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0, A1 + move A0, A + bnez N4, .L_M1_N4 +.L_M1_N3: + andi N2, N, 0x02 + beqz N2, .L_M1_N1 +.L_M1_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M1_N2_END +.L_M1_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N2_K1 +.L_M1_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST f, d, F1, C1, 0x00, \ + F0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M1_N1: + andi N1, N, 0x01 + beqz N1, .L_M0 + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M1_N1_END +.L_M1_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N1_K1 +.L_M1_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST f, d, F0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0 + move A0, A +.L_M0: + pop_if_used 8, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_kernel_tt_lasx.S b/kernel/loongarch64/dgemm_small_kernel_tt_lasx.S new file mode 100644 index 000000000..b3e338514 --- /dev/null +++ b/kernel/loongarch64/dgemm_small_kernel_tt_lasx.S @@ -0,0 +1,534 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 +#ifdef B0 +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 +#define N2 $t2 +#define N1 $t2 +#define K_LDB $t3 +#define A0 $t4 +#define X0 $t5 +#define A1 $t6 +#define A2 $t7 +#define A3 $t8 +#define C0 $s0 +#define C1 $s1 +#define C2 $s2 +#define C3 $s3 +#define K1 $s4 +#define B1 $s5 +#define B2 $s6 +#define B3 $s7 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define T0 $xr6 +#define T1 $xr7 +#define T2 $xr8 +#define T3 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define Y2 $xr12 +#define Y3 $xr13 +#define G0 $xr14 +#define G1 $xr15 +#define G2 $xr16 +#define G3 $xr17 +#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 +#define V1 $vr3 +#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + + PROLOGUE + PTR_LD LDC, $sp, 0 + push_if_used 8, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, VBETA +#endif + PTR_SLLI LDA, LDA, 3 + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_MUL K_LDB, K, LDB + PTR_SRAI M4, M, 2 // M >> 2 + beqz M4, .L_M3 +.L_M4: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M4_N3 +.L_M4_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M4_N4_END + PTR_SRAI K1, K1, 2 + beq ZERO, K1, .L_M4_N4_K3 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB +.L_M4_N4_K4: + PTR_ADDI K1, K1, -1 + GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00 + GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GLDREPL xv, d, T0, X0, 0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08 + GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10 + GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, T0, D1, \ + D2, S0, Y0, D2, \ + D3, S0, G0, D3 + GMADD xvf, d, D0, S1, Z1, D0, \ + D1, S1, T1, D1, \ + D2, S1, Y1, D2, \ + D3, S1, G1, D3 + GMADD xvf, d, D0, S2, Z2, D0, \ + D1, S2, T2, D1, \ + D2, S2, Y2, D2, \ + D3, S2, G2, D3 + GMADD xvf, d, D0, S3, Z3, D0, \ + D1, S3, T3, D1, \ + D2, S3, Y3, D2, \ + D3, S3, G3, D3 + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL 
B3, LDB, B3, 2 + + PTR_ADDI A0, A0, 0x20 + PTR_ADDI A1, A1, 0x20 + PTR_ADDI A2, A2, 0x20 + PTR_ADDI A3, A3, 0x20 + bnez K1, .L_M4_N4_K4 +.L_M4_N4_K3: + andi K1, K, 3 + beqz K1, .L_M4_N4_END +.L_M4_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N4_K1 + .L_M4_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, \ + D1, S1, VBETA, D1, \ + D2, S2, VBETA, D2, \ + D3, S3, VBETA, D3 +#endif + GST xv, , D3, C3, 0x00, \ + D2, C2, 0x00, \ + D1, C1, 0x00, \ + D0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0, A1, A2, A3 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA + bnez N4, .L_M4_N4 +.L_M4_N3: + andi N2, N, 0x02 + beqz N2, .L_M4_N1 +.L_M4_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M4_N2_END +.L_M4_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N2_K1 +.L_M4_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D1, C1, 0x00, \ + D0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA +.L_M4_N1: + andi N1, N, 0x01 + beqz N1, .L_M4_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M4_N1_END +.L_M4_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 + GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + PTR_ADDI A2, A2, 0x08 + PTR_ADDI A3, A3, 0x08 + bnez K1, .L_M4_N1_K1 +.L_M4_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST xv, , D0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x08 + // Restore A0 + move A0, A + PTR_ADD A1, A0, LDA + PTR_ADD A2, A1, LDA + PTR_ADD A3, A2, LDA +.L_M4_END: + PTR_ADDI M4, M4, -1 + PTR_ALSL A, LDA, A, 2 // A += LDA << 2; + PTR_ADDI C, C, 0x20 + bnez M4, .L_M4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + PTR_ADD A1, A0, LDA + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M2_N3 +.L_M2_N4: + GXOR 
xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M2_N4_END +.L_M2_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N4_K1 + .L_M2_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST v, , V3, C3, 0x00, \ + V2, C2, 0x00, \ + V1, C1, 0x00, \ + V0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA + bnez N4, .L_M2_N4 +.L_M2_N3: + andi N2, N, 0x02 + beqz N2, .L_M2_N1 +.L_M2_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M2_N2_END +.L_M2_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N2_K1 +.L_M2_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST v, , V1, C1, 0x00, \ + V0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA +.L_M2_N1: + andi N1, N, 0x01 + beqz N1, .L_M2_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M2_N1_END +.L_M2_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A1, 0x00 + GINSVE0 xv, d, S0, S1, 1 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + PTR_ADDI A1, A1, 0x08 + bnez K1, .L_M2_N1_K1 +.L_M2_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST v, , V0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x08 + // Restore A0, A1 + move A0, A + PTR_ADD A1, A0, LDA +.L_M2_END: + PTR_ALSL A, LDA, A, 1 // A += LDA << 1; + PTR_ADDI C, C, 0x10 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M1_N3 +.L_M1_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M1_N4_END +.L_M1_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1, \ + D2, S0, Z2, D2, \ + D3, S0, Z3, D3 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N4_K1 + .L_M1_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 
0x00, S2, C2, 0x00, S3, C3, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST f, d, F3, C3, 0x00, \ + F2, C2, 0x00, \ + F1, C1, 0x00, \ + F0, C0, 0x00 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x20 + // Restore A0, A1 + move A0, A + bnez N4, .L_M1_N4 +.L_M1_N3: + andi N2, N, 0x02 + beqz N2, .L_M1_N1 +.L_M1_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M1_N2_END +.L_M1_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08 + GMADD xvf, d, D0, S0, Z0, D0, \ + D1, S0, Z1, D1 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N2_K1 +.L_M1_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C1, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST f, d, F1, C1, 0x00, \ + F0, C0, 0x00 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x10 + // Restore A0 + move A0, A +.L_M1_N1: + andi N1, N, 0x01 + beqz N1, .L_M0 + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M1_N1_END +.L_M1_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADD X0, X0, LDB + PTR_ADDI A0, A0, 0x08 + bnez K1, .L_M1_N1_K1 +.L_M1_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif + GST f, d, F0, C0, 0x00 + // Update C0 + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K_LDB + PTR_ADDI X0, X0, 0x08 + // Restore A0 + move A0, A +.L_M0: + pop_if_used 8, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_matrix_permit.c b/kernel/loongarch64/dgemm_small_matrix_permit.c new file mode 100644 index 000000000..df262a6bb --- /dev/null +++ b/kernel/loongarch64/dgemm_small_matrix_permit.c @@ -0,0 +1,44 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + + if (transa) { + if (MNK <= 24.0 * 24.0 * 24.0) + return 1; + } else { + if (MNK <= 64.0 * 64.0 * 64.0) + return 1; + } + + return 0; +} + From 4e9144b39faa6b0542e1b7c183e0dab1a8637a07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 Jun 2024 23:43:52 +0200 Subject: [PATCH 6/9] Update .cirrus.yml (#4735) * Update versions (and paths) of XCode, and update FreeBSD version --- .cirrus.yml | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index e8a0b85c0..d0e1eeff7 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -41,7 +41,7 @@ macos_instance: # - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 macos_instance: - image: ghcr.io/cirruslabs/macos-monterey-xcode:latest + image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest task: name: AppleM1/LLVM x86_64 xbuild compile_script: @@ -58,8 +58,8 @@ task: - export VALID_ARCHS="i386 x86_64" - xcrun --sdk macosx --show-sdk-path - xcodebuild -version - - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.4.sdk -arch x86_64" + - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64" - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: @@ -70,7 +70,7 @@ task: # type: application/octet-streamm macos_instance: - image: ghcr.io/cirruslabs/macos-monterey-xcode:latest + image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest task: name: AppleM1/LLVM armv8-ios xbuild compile_script: @@ -78,8 +78,10 @@ task: - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.4.sdk -arch arm64 -miphoneos-version-min=10.0" + - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0" + 
- xcrun --sdk iphoneos --show-sdk-path + - ls -l /Applications - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 always: config_artifacts: @@ -96,11 +98,11 @@ task: - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - ls /System/Volumes/Data/opt/homebrew - - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/ + - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk - find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib" - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26c/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang + - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: @@ -132,7 +134,7 @@ task: FreeBSD_task: name: FreeBSD-gcc12 freebsd_instance: - image_family: freebsd-13-2 + image_family: freebsd-13-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -143,7 +145,7 @@ FreeBSD_task: FreeBSD_task: name: freebsd-gcc12-ilp64 freebsd_instance: - image_family: freebsd-13-2 + image_family: freebsd-13-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -153,7 +155,7 @@ FreeBSD_task: FreeBSD_task: name: FreeBSD-clang-openmp freebsd_instance: - image_family: freebsd-13-2 + image_family: freebsd-13-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so From af73ae6208b3d8ecbdb37bca07ac63734f847c0f Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 6 Jun 2024 16:43:09 +0800 Subject: [PATCH 7/9] LoongArch: Fixed issue 4728 --- kernel/loongarch64/scal.S | 74 -------------------------------- kernel/loongarch64/scal_lasx.S | 75 ++------------------------------ kernel/loongarch64/scal_lsx.S | 78 +--------------------------------- 3 files changed, 5 insertions(+), 222 deletions(-) diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S index 566bce6cb..8de710f41 100644 --- a/kernel/loongarch64/scal.S +++ b/kernel/loongarch64/scal.S @@ -56,80 +56,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
MTC a1, $r0 slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 - CMPEQ $fcc0, ALPHA, a1 - bceqz $fcc0, .L50 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L15 - .align 3 - -.L12: - ST a1, X, 0 * SIZE - ST a1, X, 1 * SIZE - ST a1, X, 2 * SIZE - ST a1, X, 3 * SIZE - ST a1, X, 4 * SIZE - ST a1, X, 5 * SIZE - ST a1, X, 6 * SIZE - ST a1, X, 7 * SIZE - addi.w I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L12 - .align 3 - -.L15: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L16: - ST a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L16 - move $r4, $r17 - fmov.d $f0, $f22 - jirl $r0, $r1, 0x0 - .align 3 - -.L20: - srai.d I, N, 3 - bge $r0, I, .L25 - .align 3 - -.L22: - ST a1, X, 0 * SIZE - add.d X, X, INCX - ST a1, X, 0 * SIZE - add.d X, X, INCX - ST a1, X, 0 * SIZE - add.d X, X, INCX - ST a1, X, 0 * SIZE - add.d X, X, INCX - ST a1, X, 0 * SIZE - add.d X, X, INCX - ST a1, X, 0 * SIZE - add.d X, X, INCX - ST a1, X, 0 * SIZE - add.d X, X, INCX - ST a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L22 - .align 3 - -.L25: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L26: - addi.d I, I, -1 - ST a1, X, 0 * SIZE - add.d X, X, INCX - blt $r0, I, .L26 - move $r4, $r17 - fmov.d $f0, $f22 - jirl $r0, $r1, 0x0 - .align 3 .L50: srai.d I, N, 3 diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S index 48e2c0718..b4585c1b9 100644 --- a/kernel/loongarch64/scal_lasx.S +++ b/kernel/loongarch64/scal_lasx.S @@ -58,12 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FFINT a2, a2 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT - CMPEQ $fcc0, ALPHA, a1 - bcnez $fcc0, .L20 //ALPHA==0 CMPEQ $fcc0, ALPHA, a2 bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + beq INCX, TEMP, .L30 //ALPHA!=1 and INCX==1 MTG TEMP, ALPHA #ifdef DOUBLE xvreplgr2vr.d VALPHA, TEMP @@ -73,7 +72,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move XX, X .align 3 -.L10: //ALPHA!=0|1 and INCX!=1 +.L10: //ALPHA!=1 and INCX!=1 bge $r0, I, .L32 .align 3 .L11: @@ -166,74 +165,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
blt $r0, I, .L11 b .L32 .align 3 - -.L20: - srai.d I, N, 3 - beq INCX, TEMP, .L24 - bge $r0, I, .L22 - .align 3 - -.L21: - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L23: - ST a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L23 - jirl $r0, $r1, 0 - .align 3 - -.L24: - bge $r0, I, .L26 /*N<8 INCX==1*/ - .align 3 -.L25: - xvxor.v VX0, VX0, VX0 - xvst VX0, X, 0 * SIZE -#ifdef DOUBLE - xvst VX0, X, 4 * SIZE -#endif - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L25 - .align 3 - -.L26: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L27: - ST a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L27 - jirl $r0, $r1, 0 - .align 3 - .L30: bge $r0, I, .L32/*N<8 INCX==1*/ MTG TEMP, ALPHA diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S index 1ffce7db2..a27e050ed 100644 --- a/kernel/loongarch64/scal_lsx.S +++ b/kernel/loongarch64/scal_lsx.S @@ -58,12 +58,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FFINT a2, a2 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT - CMPEQ $fcc0, ALPHA, a1 - bcnez $fcc0, .L20 //ALPHA==0 CMPEQ $fcc0, ALPHA, a2 bcnez $fcc0, .L999 //ALPHA==1 return srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + beq INCX, TEMP, .L30 //ALPHA!=1 and INCX==1 MTG TEMP, ALPHA #ifdef DOUBLE vreplgr2vr.d VALPHA, TEMP @@ -73,7 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move XX, X .align 3 -.L10: //ALPHA!=0|1 and INCX!=1 +.L10: //ALPHA!=1 and INCX!=1 bge $r0, I, .L32 .align 3 @@ -171,78 +169,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L32 .align 3 -.L20: - srai.d I, N, 3 - beq INCX, TEMP, .L24 - bge $r0, I, .L22 - .align 3 - -.L21: - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - ST a1, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L23: - ST a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L23 - jirl $r0, $r1, 0 - .align 3 - -.L24: - bge $r0, I, .L26 /*N<8 INCX==1*/ - .align 3 - -.L25: - vxor.v VX0, VX0, VX0 - vst VX0, X, 0 * SIZE -#ifdef DOUBLE - vst VX0, X, 2 * SIZE - vst VX0, X, 4 * SIZE - vst VX0, X, 6 * SIZE -#else - vst VX0, X, 4 * SIZE -#endif - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L25 - .align 3 - -.L26: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L27: - ST a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L27 - jirl $r0, $r1, 0 - .align 3 - .L30: bge $r0, I, .L32/*N<8 INCX==1*/ MTG TEMP, ALPHA From 2787c9f8e4ee356dfff7f31c7bcb718c01552edc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jun 2024 14:39:50 +0200 Subject: [PATCH 8/9] Disable GEMM3M for generic targets (not implemented) --- kernel/Makefile.L3 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 863f376e9..87fd5ca10 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -17,6 +17,16 @@ ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif +ifneq ($(DYNAMIC_ARCH), 1) +ifeq ($(TARGET), GENERIC) +USE_GEMM3M = 0 +endif +else +ifeq ($(CORE), GENERIC) +USE_GEMM3M = 0 +endif +endif + ifeq ($(ARCH), arm) USE_TRMM = 1 endif From f96ee867112ed5fe053fd352166b3d78a88cf996 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jun 2024 21:17:36 +0200 Subject: [PATCH 9/9] remove .mod files during make clean --- lapack-netlib/SRC/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index de2242701..2a20fcdab 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -668,7 +668,7 @@ FRC: .PHONY: clean cleanobj cleanlib clean: cleanobj cleanlib cleanobj: - rm -f *.o DEPRECATED/*.o + rm -f *.o *.mod DEPRECATED/*.o DEPRECATED/*.mod cleanlib: rm -f $(LAPACKLIB)
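
Note on the LoongArch64 small-matrix GEMM gate: the new dgemm_small_matrix_permit.c shown above decides whether the dedicated small-matrix kernel handles a DGEMM call by comparing the product M*N*K against a fixed cutoff (24^3 when A is transposed, 64^3 otherwise); alpha and beta are accepted but not used in the decision. The following is a minimal standalone sketch of that threshold logic for experimenting outside the build tree. The plain function name and C types below are illustrative stand-ins for the CNAME/BLASLONG/FLOAT macros used in the real kernel source and are not part of the patch.

#include <stdio.h>

/* Illustrative stand-in for dgemm_small_matrix_permit.c: decide whether the
 * small-matrix kernel should handle an M x N x K DGEMM call. */
static int small_matrix_permit(int transa, int transb,
                               long M, long N, long K,
                               double alpha, double beta)
{
    double MNK = (double)M * (double)N * (double)K;

    if (transa)                              /* transposed A: tighter cutoff */
        return MNK <= 24.0 * 24.0 * 24.0;
    return MNK <= 64.0 * 64.0 * 64.0;        /* plain A: up to 64^3 of work */
}

int main(void)
{
    /* 32x32x32 is below the 64^3 cutoff, so the small kernel is permitted... */
    printf("%d\n", small_matrix_permit(0, 0, 32, 32, 32, 1.0, 0.0));  /* 1 */
    /* ...but the same shape exceeds the 24^3 cutoff once A is transposed. */
    printf("%d\n", small_matrix_permit(1, 0, 32, 32, 32, 1.0, 0.0));  /* 0 */
    return 0;
}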
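
Note on the PATCH 7/9 scal changes: the diffs drop the dedicated ALPHA == 0 branches that stored literal zeros into X (the .L20-.L27 blocks removed from scal.S, scal_lasx.S and scal_lsx.S), so the kernels now always compute x[i] = alpha * x[i] whenever alpha != 1. The likely motivation -- stated here as an assumption, since issue 4728 itself is not quoted in the patch -- is IEEE-754 propagation: storing 0.0 erases NaN/Inf values already present in X, whereas 0.0 * NaN stays NaN, which is what the LAPACK tests exercise. A small C illustration of the behavioural difference follows; the function names are invented for the example and are not OpenBLAS symbols.

#include <math.h>
#include <stdio.h>

/* Old-style shortcut: alpha == 0 writes zeros and hides NaN/Inf in X. */
static void scal_with_shortcut(long n, double alpha, double *x)
{
    if (alpha == 0.0) {
        for (long i = 0; i < n; i++) x[i] = 0.0;
        return;
    }
    for (long i = 0; i < n; i++) x[i] *= alpha;
}

/* Patched behaviour: always multiply, so 0.0 * NaN propagates as NaN. */
static void scal_propagating(long n, double alpha, double *x)
{
    for (long i = 0; i < n; i++) x[i] *= alpha;
}

int main(void)
{
    double a[2] = { 1.0, NAN }, b[2] = { 1.0, NAN };
    scal_with_shortcut(2, 0.0, a);
    scal_propagating(2, 0.0, b);
    printf("shortcut:    %g %g\n", a[0], a[1]);   /* 0 0   */
    printf("propagating: %g %g\n", b[0], b[1]);   /* 0 nan */
    return 0;
}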