From 479e4af0893f3772eff9a75e89cf28bef91383c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 5 Feb 2024 15:35:24 +0100 Subject: [PATCH 01/14] Rescale input vector more often to minimize relative error (Reference-LAPACK PR 981) --- lapack-netlib/SRC/clarfgp.f | 30 ++++++++++-------------------- lapack-netlib/SRC/zlarfgp.f | 30 ++++++++++-------------------- 2 files changed, 20 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f index 47b5e47b0..980e93612 100644 --- a/lapack-netlib/SRC/clarfgp.f +++ b/lapack-netlib/SRC/clarfgp.f @@ -148,33 +148,23 @@ ALPHR = REAL( ALPHA ) ALPHI = AIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = SLAPY2( ALPHR, ALPHI ) - TAU = CMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f index 6c9efb04c..d54f2ea5d 100644 --- a/lapack-netlib/SRC/zlarfgp.f +++ b/lapack-netlib/SRC/zlarfgp.f @@ -148,33 +148,23 @@ ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = DLAPY2( ALPHR, ALPHI ) - TAU = DCMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. 
+ TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * From fe3da43b7dac7af1be5538f87f707a6073fbc52c Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 6 Feb 2024 11:49:01 +0800 Subject: [PATCH 02/14] Optimized zgemm kernel 8*4 LASX, 4*4 LSX and cgemm kernel 8*4 LSX for LoongArch --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 20 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 10 +- kernel/loongarch64/cgemm_kernel_8x4_lsx.S | 3313 ++++++++++++++++++ kernel/loongarch64/cgemm_ncopy_4_lsx.S | 341 ++ kernel/loongarch64/cgemm_ncopy_8_lsx.S | 263 ++ kernel/loongarch64/cgemm_tcopy_4_lsx.S | 324 ++ kernel/loongarch64/cgemm_tcopy_8_lsx.S | 277 ++ kernel/loongarch64/zgemm_kernel_4x4_lsx.S | 2316 +++++++++++++ kernel/loongarch64/zgemm_kernel_8x4_lasx.S | 3545 ++++++++++++++++++++ kernel/loongarch64/zgemm_ncopy_4_lasx.S | 320 ++ kernel/loongarch64/zgemm_ncopy_4_lsx.S | 332 ++ kernel/loongarch64/zgemm_ncopy_8_lasx.S | 263 ++ kernel/loongarch64/zgemm_tcopy_4_lasx.S | 302 ++ kernel/loongarch64/zgemm_tcopy_4_lsx.S | 355 ++ kernel/loongarch64/zgemm_tcopy_8_lasx.S | 268 ++ param.h | 10 +- 16 files changed, 12248 insertions(+), 11 deletions(-) create mode 100644 kernel/loongarch64/cgemm_kernel_8x4_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_8_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_8_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_4x4_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_8x4_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_4_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_8_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_4_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_8_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index f4ab495e6..c7ef44035 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -100,9 +100,13 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMMKERNEL = cgemm_kernel_2x2_lsx.S -CGEMMONCOPY = cgemm_ncopy_2_lsx.S -CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMKERNEL = cgemm_kernel_8x4_lsx.S +CGEMMINCOPY = cgemm_ncopy_8_lsx.S +CGEMMITCOPY = cgemm_tcopy_8_lsx.S +CGEMMONCOPY = cgemm_ncopy_4_lsx.S +CGEMMOTCOPY = cgemm_tcopy_4_lsx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -111,4 +115,14 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S +ZGEMMONCOPY = zgemm_ncopy_4_lsx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index bd85fab01..17d15656a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ 
b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -122,9 +122,13 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S -ZGEMMONCOPY = zgemm_ncopy_2_lasx.S -ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S +ZGEMMINCOPY = zgemm_ncopy_8_lasx.S +ZGEMMITCOPY = zgemm_tcopy_8_lasx.S +ZGEMMONCOPY = zgemm_ncopy_4_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/cgemm_kernel_8x4_lsx.S b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S new file mode 100644 index 000000000..1e9fd8524 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S @@ -0,0 +1,3313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.w VALPHAR, $sp, 112 + vldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + 
vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vand.v D0, U1, U1 + + vpermi.w D0, U0, 0x44 + vpermi.w U1, U0, 0xee + + vst D0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..87a88e37d --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + fld.s F0, S5, 0x00 + fld.s F1, S5, 0x04 + fld.s F2, S6, 0x00 + fld.s F3, S6, 0x04 + fld.s F4, S7, 0x00 + fld.s F5, S7, 0x04 + fld.s F6, S8, 0x00 + fld.s F7, S8, 0x04 + + fst.s F0, TD, 0x20 + fst.s F1, TD, 0x24 + fst.s F2, TD, 0x28 + fst.s F3, TD, 0x2c + fst.s F4, TD, 0x30 + fst.s F5, TD, 0x34 + fst.s F6, TD, 0x38 + fst.s F7, TD, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + 
addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d S1, S1, 0x08 + addi.d TD, TD, 0x08 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..6d63d62e7 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S @@ -0,0 +1,324 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + 
add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + + vst U0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..2935bbc07 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S @@ -0,0 +1,277 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + vst U4, TD, 0x40 + vst U5, TD, 0x50 + vst U6, TD, 0x60 + vst U7, TD, 0x70 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x10 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d TD, TD, 0x10 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + fld.s F0, 
S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S new file mode 100644 index 000000000..6c4841b24 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.d VALPHAR, $sp, 112 + vldrepl.d VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v 
D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvand.v D0, U0, U0 + + xvpermi.q D0, U1, 0x02 + xvpermi.q U1, U0, 0x31 + + xvst D0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + xvld U0, S1, 0x00 + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld $vr0, S1, 0x00 + vst $vr0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..203471cbd --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U8, TD, 0x20 + vst U12, TD, 0x30 + + vst U1, TD, 0x40 + vst U5, TD, 0x50 + vst U9, TD, 0x60 + vst U13, TD, 0x70 + + vst U2, TD, 0x80 + vst U6, TD, 0x90 + vst U10, TD, 0xa0 + vst U14, TD, 0xb0 + + vst U3, TD, 0xc0 + vst U7, TD, 0xd0 + vst U11, TD, 0xe0 + vst U15, TD, 0xf0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U4, TD, 0x20 + vst U6, TD, 0x30 + + vst U1, TD, 0x40 + vst U3, TD, 0x50 + vst U5, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U1, TD, 0x20 + vst U5, TD, 0x30 + + vst U2, TD, 0x40 + vst U6, TD, 0x50 + vst U3, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 
0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U1, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..7cd8f605b --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S5, 0x00 + fld.d F1, S5, 0x08 + fld.d F2, S6, 0x00 + fld.d F3, S6, 0x08 + fld.d F4, S7, 0x00 + fld.d F5, S7, 0x08 + fld.d F6, S8, 0x00 + fld.d F7, S8, 0x08 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x10 
+ addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + + addi.d S1, S1, 0x10 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..1adee11c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S @@ -0,0 +1,302 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + xvst U4, S8, 0x80 + xvst U5, S8, 0xa0 + xvst U6, S8, 0xc0 + xvst U7, S8, 0xe0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + xvst U2, S9, 0x40 + xvst U3, S9, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + vst $vr2, S10, 0x20 + vst $vr3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, S1, 0x00 + xvld U1, 
S2, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + xvld U0, S1, 0x00 + + xvst U0, S9, 0x00 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, S1, 0x00 + + vst $vr0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..954753eaf --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S @@ -0,0 +1,355 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + vst U8, S8, 0x80 + vst U9, S8, 0x90 + vst U10, S8, 0xa0 + vst U11, S8, 0xb0 + vst U12, S8, 0xc0 + vst U13, S8, 0xd0 + vst U14, S8, 0xe0 + vst U15, S8, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + vst U4, S9, 0x40 + vst U5, S9, 0x50 + vst U6, S9, 0x60 + vst U7, S9, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + vst U2, S10, 0x20 + vst U3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, 
N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld U0, S1, 0x00 + + vst U0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..f7440dc24 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x80 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + xvst U4, TD, 0x80 + xvst U5, TD, 0xa0 + xvst U6, TD, 0xc0 + xvst U7, TD, 0xe0 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x100 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + addi.d TD, TD, 0x80 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TD, TD, 0x40 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* 
if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d TD, TD, 0x20 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/param.h b/param.h index 003ea396f..5d2e960a2 100644 --- a/param.h +++ b/param.h @@ -2854,12 +2854,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 @@ -2891,10 +2891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 From d4db6a9f16a5c82bbe1860f591cc731c4d83d7c8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Feb 2024 22:23:47 +0100 Subject: [PATCH 03/14] Separate the interface for SBGEMMT from GEMMT due to differences in GEMV arguments --- interface/CMakeLists.txt | 1 + interface/Makefile | 8 +- interface/gemmt.c | 3 +- interface/sbgemmt.c | 447 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 452 insertions(+), 7 deletions(-) create mode 100644 interface/sbgemmt.c diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ed19b556a..55374674a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -119,6 +119,7 @@ endif () if (BUILD_BFLOAT16) GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") diff --git a/interface/Makefile b/interface/Makefile index ad4a0fb89..048d679d6 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1303,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +sbgemmt.$(SUFFIX) 
sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1662,10 +1662,6 @@ cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) -sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1971,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) -cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif diff --git a/interface/gemmt.c b/interface/gemmt.c index 8fd8089d0..018deb7fb 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -158,7 +158,8 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, uplo = 0; if (Uplo == 'L') uplo = 1; - + + nrowa = m; if (transa & 1) nrowa = k; nrowb = k; #if defined(COMPLEX) diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c new file mode 100644 index 000000000..759af4bfb --- /dev/null +++ b/interface/sbgemmt.c @@ -0,0 +1,447 @@ +/*********************************************************************/ +/* Copyright 2024, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#define SMP_THRESHOLD_MIN 65536.0 +#define ERROR_NAME "SBGEMMT " + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + blasint nrowa, nrowb; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; + FLOAT alpha, beta; + + PRINT_DEBUG_NAME; + + m = *M; + k = *K; + + alpha = *Alpha; + beta = *Beta; + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; + + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; + + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + info = 0; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + + if (info != 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ + IFLOAT *aa, *bb; + FLOAT *cc; + + int transa, transb, uplo; + blasint info; + blasint lda, ldb; + IFLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + uplo = -1; + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; + + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; + + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; + + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + blasint nrowa; + blasint nrowb; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + } + + if (order == CblasRowMajor) { + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; + + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; + + if 
(TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; + + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; + + info = -1; + + blasint ncola; + blasint ncolb; + + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; + + if (transb & 1) { + ncolb = m; + } + + if (ldc < MAX(1,m)) + info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 2; + if (transa < 0) + info = 3; + if (uplo < 0) + info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + int buffer_size; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, + BLASLONG, IFLOAT *, BLASLONG, FLOAT, + FLOAT *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, + IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { + SBGEMV_N, SBGEMV_T,}; + + + if (m == 0) + return; + + IDEBUG_START; + + const blasint incb = ((transb & 1) == 0) ? 1 : ldb; + + if (uplo == 1) { + for (i = 0; i < m; i++) { + j = m - i; + + aa = a + i; + bb = b + i * ldb; + if (transa & 1) { + aa = a + lda * i; + } + if (transb & 1) + bb = b + i; + cc = c + i * ldc + i; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, aa, lda, + bb, incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, aa, lda, + bb, incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < m; i++) { + j = i + 1; + + bb = b + i * ldb; + if (transb & 1) { + bb = b + i; + } + cc = c + i * ldc; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, a, lda, bb, + incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, a, lda, bb, + incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + } +#endif + + STACK_FREE(buffer); + } + } + + IDEBUG_END; + + return; +} From fb99fc2e6e4ec8ecdcfffe1ca1aeb787464d2825 Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Wed, 7 Feb 2024 13:42:08 +0100 Subject: [PATCH 04/14] fix type conversion warnings --- test/compare_sgemm_sbgemm.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index cf808b56d..4afa8bf93 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -81,6 +81,16 @@ float16to32 (bfloat16_bits f16) return f32.v; } +float +float32to16 (float32_bits f32) +{ + bfloat16_bits f16; + f16.bits.s = f32.bits.s; + f16.bits.e = f32.bits.e; + f16.bits.m = (uint32_t) f32.bits.m >> 16; + return f32.v; +} + int main (int argc, char *argv[]) { @@ -108,16 +118,16 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; - BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; + AA[j * k + i].v = float32to16( A[j * k + i] ); + BB[j * k + i].v = float32to16( B[j * k + i] ); CC[j * k + i] = 0; DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, + &m, (bfloat16*)BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) From 08ce6b1c1c6468c26607353d516e0dc8c009f58e Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 7 Feb 2024 07:54:06 -0600 Subject: [PATCH 05/14] Add missing CPU ID definitions for old versions of AIX. --- driver/others/dynamic_power.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 0454f186c..16320dc40 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -43,6 +43,13 @@ char *gotoblas_corename(void) { #define CPU_POWER9 9 #define CPU_POWER10 10 +#ifndef POWER_9 +#define POWER_9 0x20000 /* 9 class CPU */ +#endif +#ifndef POWER_10 +#define POWER_10 0x40000 /* 10 class CPU */ +#endif + #ifdef _AIX #include From e9f480111e1d5b6f69c8053f79375b0a4242712f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 7 Feb 2024 19:57:18 +0100 Subject: [PATCH 06/14] fix sbgemm bfloat16 conversion errors introduced in PR 4488 --- test/compare_sgemm_sbgemm.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 4afa8bf93..bc74233ab 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -81,16 +81,6 @@ float16to32 (bfloat16_bits f16) return f32.v; } -float -float32to16 (float32_bits f32) -{ - bfloat16_bits f16; - f16.bits.s = f32.bits.s; - f16.bits.e = f32.bits.e; - f16.bits.m = (uint32_t) f32.bits.m >> 16; - return f32.v; -} - int main (int argc, char *argv[]) { @@ -110,6 +100,8 @@ main (int argc, char *argv[]) float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; float DD[m * n], CC[m * n]; + bfloat16 atmp,btmp; + blasint one=1; for (j = 0; j < m; j++) { @@ -118,8 +110,10 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = float32to16( A[j * k + i] ); - BB[j * k + i].v = float32to16( B[j * k + i] ); + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); + AA[j 
* k + i].v = atmp; + BB[j * k + i].v = btmp; CC[j * k + i] = 0; DD[j * k + i] = 0; } From b3fa16345d83b723b8984b78dc6a2bb5d9f3d479 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 13:15:34 +0100 Subject: [PATCH 07/14] fix prototype for c/zaxpby --- common_interface.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_interface.h b/common_interface.h index 61a82c306..5a2e1654c 100644 --- a/common_interface.h +++ b/common_interface.h @@ -773,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *); +void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *); void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); From 500ac4de5e20596d5cd797d745db97dd0a62ff86 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 13:18:34 +0100 Subject: [PATCH 08/14] fix incompatible pointer types --- interface/zaxpby.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/interface/zaxpby.c b/interface/zaxpby.c index 3a4db7403..e5065270d 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef CBLAS -void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) +void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) { blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; + FLOAT* ALPHA = (FLOAT*) VALPHA; + FLOAT* BETA = (FLOAT*) VBETA; #else From ac6b4b7aa472799245c497ab3caa6e77c0e80852 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 8 Feb 2024 08:56:30 -0600 Subject: [PATCH 09/14] Make sure CPU ID works for all POWER_10 conditions --- driver/others/dynamic_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 16320dc40..cd5e88922 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -69,7 +69,7 @@ static int cpuid(void) else if (arch == POWER_9) return CPU_POWER9; #endif #ifdef POWER_10 - else if (arch == POWER_10) return CPU_POWER10; + else if (arch >= POWER_10) return CPU_POWER10; #endif return CPU_UNKNOWN; } From d408ecedba8cc1d9b799b89df6996bdcd996d5fc Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 8 Feb 2024 12:17:18 -0600 Subject: [PATCH 10/14] Add environment variable to display coretype for dynamic arch. 
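When the environment variable GET_OPENBLAS_CORETYPE is set, gotoblas_dynamic_init() now prints the detected core name to stderr, in addition to the existing openblas_warning() call that is only shown at higher verbosity settings. A standalone sketch of the opt-in diagnostic pattern used by the hunk below (the helper name and core string are placeholders, not OpenBLAS symbols):

    #include <stdio.h>
    #include <stdlib.h>

    /* Emit a diagnostic only when the user opts in via an environment
     * variable; the variable's value itself is not interpreted. */
    static void report_core(const char *corename) {
        char coremsg[128];
        snprintf(coremsg, sizeof(coremsg), "Core: %s\n", corename);
        if (getenv("GET_OPENBLAS_CORETYPE"))
            fprintf(stderr, "%s", coremsg);
    }

    int main(void) {
        report_core("POWER10");   /* e.g. run as: GET_OPENBLAS_CORETYPE=1 ./a.out */
        return 0;
    }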
--- driver/others/dynamic_power.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index cd5e88922..4c1f4a26e 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -339,6 +339,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas && gotoblas -> init) { strncpy(coren,gotoblas_corename(),20); sprintf(coremsg, "Core: %s\n",coren); + if (getenv("GET_OPENBLAS_CORETYPE")) { + fprintf(stderr, "%s", coremsg); + } openblas_warning(2, coremsg); gotoblas -> init(); } else { From 83bec5135534c94fe1790aa5795620a7cdeaa916 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 21:23:48 +0100 Subject: [PATCH 11/14] Update with recently added CBLAS interfaces and LAPACK/LAPACKE functions --- exports/gensymbol | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/exports/gensymbol b/exports/gensymbol index 704eab06f..93edf2400 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -60,6 +60,7 @@ cblasobjsc=" cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy + cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin " cblasobjsd=" cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot @@ -69,6 +70,7 @@ cblasobjsd=" cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy + cblas_damax cblas_damin " cblasobjss=" @@ -80,6 +82,7 @@ cblasobjss=" cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy + cblas_samax cblas_samin " cblasobjsz=" @@ -91,6 +94,7 @@ cblasobjsz=" cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy + cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin " cblasobjs="cblas_xerbla" @@ -861,6 +865,51 @@ lapackobjs2z="$lapackobjs2z zgedmd zgedmdq " + +#functions added post 3.11 + +lapackobjs2c="$lapackobjs2c + claqp2rk + claqp3rk + claqz0 + claqz1 + claqz2 + claqz3 + clatrs3 + ctrsyl3 + " +lapackobjs2d="$lapackobjs2d + dgelqs + dgelst + dgeqp3rk + dgeqrs + dlaqp2rk + dlaqp3rk + dlaqz0 + dlaqz1 + dlaqz2 + dlaqz3 + dlaqz4 + dlarmm + dlatrs3 + dtrsyl3 + " +lapackobjs2z="$lapackobjs2z + zgelqs + zgelst + zgeqp3rk + zgeqrs + zlaqp2rk + zlaqp3rk + zlaqz0 + zlaqz1 + zlaqz2 + zlaqz3 + zlatrs3 + zrscl + ztrsyl3 +" + lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx @@ -1622,6 +1671,14 @@ lapackeobjsc=" LAPACKE_cgetsqrhrt_work LAPACKE_cungtsqr_row LAPACKE_cungtsqr_row_work + LAPACKE_clangb + LAPACKE_clangb_work + LAPACKE_ctrsyl3 + LAPACKE_ctrsyl3_work + LAPACKE_ctz_nancheck + LAPACKE_ctz_trans + LAPACKE_cunhr_col + LAPACKE_cunhr_col_work " lapackeobjsd=" @@ -2239,6 +2296,14 @@ lapackeobjsd=" LAPACKE_dgetsqrhrt_work LAPACKE_dorgtsqr_row LAPACKE_dorgtsqr_row_work + LAPACKE_dlangb + LAPACKE_dlangb_work + LAPACKE_dorhr_col + LAPACKE_dorhr_col_work + LAPACKE_dtrsyl3 + 
LAPACKE_dtrsyl3_work + LAPACKE_dtz_nancheck + LAPACKE_dtz_trans " lapackeobjss=" @@ -2848,6 +2913,14 @@ lapackeobjss=" LAPACKE_sgetsqrhrt_work LAPACKE_sorgtsqr_row LAPACKE_sorgtsqr_row_work + LAPACKE_slangb + LAPACKE_slangb_work + LAPACKE_sorhr_col + LAPACKE_sorhr_col_work + LAPACKE_strsyl3 + LAPACKE_strsyl3_work + LAPACKE_stz_nancheck + LAPACKE_stz_trans " lapackeobjsz=" @@ -3515,6 +3588,14 @@ lapackeobjsz=" LAPACKE_zgetsqrhrt_work LAPACKE_zungtsqr_row LAPACKE_zungtsqr_row_work + LAPACKE_zlangb + LAPACKE_zlangb_work + LAPACKE_ztrsyl3 + LAPACKE_ztrsyl3_work + LAPACKE_ztz_nancheck + LAPACKE_ztz_trans + LAPACKE_zunhr_col + LAPACKE_zunhr_col_work " ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3616,6 +3697,7 @@ lapack_embeded_underscore_objs_s=" ssysv_aa_2stage ssytrf_aa_2stage ssytrs_aa_2stage slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col + slarfb_gett " lapack_embeded_underscore_objs_c=" chetf2_rook chetrf_rook chetri_rook @@ -3641,6 +3723,7 @@ lapack_embeded_underscore_objs_c=" csysv_aa_2stage csytrf_aa_2stage csytrs_aa_2stage claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col + clarfb_gett " lapack_embeded_underscore_objs_d=" dlasyf_rook @@ -3658,6 +3741,7 @@ lapack_embeded_underscore_objs_d=" dsysv_aa_2stage dsytrf_aa_2stage dsytrs_aa_2stage dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col + dlarfb_gett " lapack_embeded_underscore_objs_z=" zhetf2_rook zhetrf_rook zhetri_rook @@ -3682,6 +3766,7 @@ lapack_embeded_underscore_objs_z=" zhetrs_aa_2stage zsysv_aa_2stage zsytrf_aa_2stage zsytrs_aa_2stage zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col + zlarfb_gett " dirname=`pwd -P`/../lapack-netlib From 93872f4681430a6ae9d857d8044d36ba98d09bf2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 23:02:09 +0100 Subject: [PATCH 12/14] drop the ?laqz? symbols for now (not translatable by f2c) --- exports/gensymbol | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 93edf2400..226035842 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -871,13 +871,14 @@ lapackobjs2z="$lapackobjs2z lapackobjs2c="$lapackobjs2c claqp2rk claqp3rk - claqz0 - claqz1 - claqz2 - claqz3 - clatrs3 ctrsyl3 " +# claqz0 +# claqz1 +# claqz2 +# claqz3 +# clatrs3 + lapackobjs2d="$lapackobjs2d dgelqs dgelst @@ -885,15 +886,16 @@ lapackobjs2d="$lapackobjs2d dgeqrs dlaqp2rk dlaqp3rk - dlaqz0 - dlaqz1 - dlaqz2 - dlaqz3 - dlaqz4 dlarmm dlatrs3 dtrsyl3 " +# dlaqz0 +# dlaqz1 +# dlaqz2 +# dlaqz3 +# dlaqz4 + lapackobjs2z="$lapackobjs2z zgelqs zgelst @@ -901,14 +903,14 @@ lapackobjs2z="$lapackobjs2z zgeqrs zlaqp2rk zlaqp3rk - zlaqz0 - zlaqz1 - zlaqz2 - zlaqz3 zlatrs3 zrscl ztrsyl3 -" + " +# zlaqz0 +# zlaqz1 +# zlaqz2 +# zlaqz3 lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx From ff1523163f71e1ad9f1e4a2c8548416f213e7cb1 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 9 Feb 2024 12:59:14 +0000 Subject: [PATCH 13/14] Fix axpy test hangs when n==0. Reenable zaxpy_vector kernel for C910V. 
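The guard at the top of the vector kernels changes from n < 0 to n <= 0, so an empty vector returns immediately instead of reaching the strip-mining loop; the C910V target is then switched back to the vector caxpy/zaxpy kernels. A minimal scalar sketch of the guard convention being enforced (illustrative only; the real kernels below operate on RVV registers with byte strides):

    /* y := a*x + y, with the BLAS convention that n <= 0 is a no-op. */
    static int axpy_ref(long n, double a, const double *x, long incx,
                        double *y, long incy)
    {
        if (n <= 0) return 0;   /* was "n < 0": n == 0 must not fall through */
        if (a == 0.0) return 0;

        long ix = 0, iy = 0;
        for (long i = 0; i < n; i++) {
            y[iy] += a * x[ix];
            ix += incx;
            iy += incy;
        }
        return 0;
    }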
--- kernel/riscv64/KERNEL.C910V | 4 ++-- kernel/riscv64/axpy_vector.c | 2 +- kernel/riscv64/zaxpy_vector.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 066329390..2798a870e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy.c -ZAXPYKERNEL = zaxpy.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index e99ca8542..6dffe5f09 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T vy0, vy1; BLASLONG stride_x, stride_y; - if (n < 0) return(0); + if (n <= 0) return(0); if (da == 0.0) return(0); if (inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index d19e51118..1e766c5f4 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -45,10 +45,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - BLASLONG ix = 0,iy = 0; - if(n < 0) return(0); - if(da_r == 0.0 && da_i == 0.0) return(0); + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + if(n <= 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); unsigned int gvl = 0; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); From d0f5dc763be1783874c5fe5c84c0eef90e73a6d1 Mon Sep 17 00:00:00 2001 From: Dmitry Mikushin Date: Mon, 12 Feb 2024 02:18:03 +0100 Subject: [PATCH 14/14] Adding USE_GEMM3M macro to kernel targets, so that the *gemm3m functions and parameters can be included into the gotoblas structure. Fixes #4500 --- kernel/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 60314eedb..74e6760c2 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1349,6 +1349,9 @@ endif () set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") + if (USE_GEMM3M) + target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) + endif() endfunction ()
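The target_compile_definitions() call added above makes the USE_GEMM3M switch visible to the kernel translation units in the CMake build, so code and table entries guarded by #ifdef USE_GEMM3M are actually compiled in. A rough illustration of how such a compile-time switch gates optional kernels and their table entries (struct and function names here are placeholders, not the actual gotoblas layout):

    /* Build with: cc -DUSE_GEMM3M -c gemm3m_switch.c */
    typedef int (*gemm_fn)(void);

    typedef struct {
        gemm_fn zgemm_kernel;
    #ifdef USE_GEMM3M
        gemm_fn zgemm3m_kernel;   /* present only when USE_GEMM3M is defined */
    #endif
    } kernel_table_t;

    static int zgemm_stub(void)   { return 0; }
    #ifdef USE_GEMM3M
    static int zgemm3m_stub(void) { return 0; }
    #endif

    kernel_table_t kernel_table = {
        .zgemm_kernel   = zgemm_stub,
    #ifdef USE_GEMM3M
        .zgemm3m_kernel = zgemm3m_stub,
    #endif
    };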