diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index f4ab495e6..c7ef44035 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -100,9 +100,13 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMMKERNEL = cgemm_kernel_2x2_lsx.S -CGEMMONCOPY = cgemm_ncopy_2_lsx.S -CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMKERNEL = cgemm_kernel_8x4_lsx.S +CGEMMINCOPY = cgemm_ncopy_8_lsx.S +CGEMMITCOPY = cgemm_tcopy_8_lsx.S +CGEMMONCOPY = cgemm_ncopy_4_lsx.S +CGEMMOTCOPY = cgemm_tcopy_4_lsx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -111,4 +115,14 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S +ZGEMMONCOPY = zgemm_ncopy_4_lsx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index bd85fab01..17d15656a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -122,9 +122,13 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S -ZGEMMONCOPY = zgemm_ncopy_2_lasx.S -ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S +ZGEMMINCOPY = zgemm_ncopy_8_lasx.S +ZGEMMITCOPY = zgemm_tcopy_8_lasx.S +ZGEMMONCOPY = zgemm_ncopy_4_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/cgemm_kernel_8x4_lsx.S b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S new file mode 100644 index 000000000..1e9fd8524 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S @@ -0,0 +1,3313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB 
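+
+/* Reviewer note, not part of the original submission: c += a*b with
+   a = ar + i*ai and b = br + i*bi expands into four partial products:
+   ar*br and ai*bi for the real part, ar*bi and ai*br for the imaginary
+   part. The (V)MADD1..(V)MADD4 selections above pick FMADD or NMSUB for
+   each partial product, which is all that distinguishes the conjugation
+   variants; e.g. the plain NN/NT/TN/TT case computes
+   cr += ar*br - ai*bi and ci += ar*bi + ai*br. */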
+#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.w VALPHAR, $sp, 112 + vldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vand.v D0, U1, U1 + + vpermi.w D0, U0, 0x44 + vpermi.w U1, U0, 0xee + + vst D0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..87a88e37d --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + fld.s F0, S5, 0x00 + fld.s F1, S5, 0x04 + fld.s F2, S6, 0x00 + fld.s F3, S6, 0x04 + fld.s F4, S7, 0x00 + fld.s F5, S7, 0x04 + fld.s F6, S8, 0x00 + fld.s F7, S8, 0x04 + + fst.s F0, TD, 0x20 + fst.s F1, TD, 0x24 + fst.s F2, TD, 0x28 + fst.s F3, TD, 0x2c + fst.s F4, TD, 0x30 + fst.s F5, TD, 0x34 + fst.s F6, TD, 0x38 + fst.s F7, TD, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, 
S3, 0x00
+ fld.s F5, S3, 0x04
+ fld.s F6, S4, 0x00
+ fld.s F7, S4, 0x04
+
+ fst.s F0, TD, 0x00
+ fst.s F1, TD, 0x04
+ fst.s F2, TD, 0x08
+ fst.s F3, TD, 0x0c
+ fst.s F4, TD, 0x10
+ fst.s F5, TD, 0x14
+ fst.s F6, TD, 0x18
+ fst.s F7, TD, 0x1c
+
+ addi.d S1, S1, 0x08
+ addi.d S2, S2, 0x08
+ addi.d S3, S3, 0x08
+ addi.d S4, S4, 0x08
+ addi.d TD, TD, 0x20
+
+ addi.d I, I, -1
+ blt ZERO, I, .L_N11
+
+.L_N2: /* if(n&2)*/
+ andi I, N, 0x02
+ beq I, ZERO, .L_N3
+
+ move S1, TS
+ add.d S2, TS, TL
+ move I, M
+ add.d TS, S2, TL
+
+ beq I, ZERO, .L_N3
+
+.L_N21: /* if(i>0)*/
+ fld.s F0, S1, 0x00
+ fld.s F1, S1, 0x04
+ fld.s F2, S2, 0x00
+ fld.s F3, S2, 0x04
+
+ fst.s F0, TD, 0x00
+ fst.s F1, TD, 0x04
+ fst.s F2, TD, 0x08
+ fst.s F3, TD, 0x0c
+
+ addi.d S1, S1, 0x08
+ addi.d S2, S2, 0x08
+ addi.d TD, TD, 0x10
+
+ addi.d I, I, -1
+ blt ZERO, I, .L_N21
+
+.L_N3: /* if(n&1)*/
+ andi I, N, 0x01
+ beq I, ZERO, .L_N0
+
+ move S1, TS
+ move I, M
+
+ beq I, ZERO, .L_N0
+
+.L_N31: /* if(i>0)*/
+ fld.s F0, S1, 0x00
+ fld.s F1, S1, 0x04
+
+ fst.s F0, TD, 0x00
+ fst.s F1, TD, 0x04
+
+ addi.d S1, S1, 0x08
+ addi.d TD, TD, 0x08
+
+ addi.d I, I, -1
+ blt ZERO, I, .L_N31
+
+.L_N0:
+ LDARG $r23, $sp, 0
+ addi.d $sp, $sp, 8
+ jirl $r0, $r1, 0x00
+
+ EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S
new file mode 100644
index 000000000..6d63d62e7
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S
@@ -0,0 +1,324 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + 
add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + + vst U0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..2935bbc07 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S @@ -0,0 +1,277 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M $r4 // param 1: m
+#define N $r5 // param 2: n
+#define SRC $r6 // param 3: src
+#define LDA $r7 // param 4: lda
+#define DST $r8 // param 5: dst
+
+#define I $r9
+#define J $r10
+#define S1 $r12
+#define S2 $r13
+#define S3 $r14
+#define S4 $r15
+#define S5 $r16
+#define S6 $r17
+#define S7 $r18
+#define S8 $r19
+#define TD $r20
+#define TS $r11
+#define TL $r7
+#define T0 $r23
+#define ZERO $r0
+
+#define F0 $f0
+#define F1 $f1
+#define F2 $f2
+#define F3 $f3
+#define F4 $f4
+#define F5 $f5
+#define F6 $f6
+#define F7 $f7
+
+/* LSX vectors */
+#define U0 $vr0
+#define U1 $vr1
+#define U2 $vr2
+#define U3 $vr3
+#define U4 $vr4
+#define U5 $vr5
+#define U6 $vr6
+#define U7 $vr7
+#define D0 $vr8
+#define D1 $vr9
+#define D2 $vr10
+#define D3 $vr11
+#define D4 $vr12
+#define D5 $vr13
+#define D6 $vr14
+#define D7 $vr15
+
+
+ PROLOGUE
+
+ addi.d $sp, $sp, -8
+ SDARG $r23, $sp, 0
+
+ move TS, SRC //aoffset
+ move TD, DST //boffset
+ slli.d TL, LDA, 0x02 //lda
+ slli.d TL, TL, 0x01
+
+ srai.d J, N, 0x03 //j
+
+ beq J, ZERO, .L_N1
+
+.L_J1: /* if(j>0) j--*/
+ move S1, TS //aoffset1
+ slli.d T0, TL, 0x01 //2*lda
+ add.d S2, TS, TL
+ addi.d TS, TS, 0x40
+
+ srai.d I, M, 0x01
+ beq ZERO, I, .L_J1M1
+
+.L_J1I1: /* if(i>0) i--*/
+ vld U0, S1, 0x00
+ vld U1, S1, 0x10
+ vld U2, S1, 0x20
+ vld U3, S1, 0x30
+
+ vld U4, S2, 0x00
+ vld U5, S2, 0x10
+ vld U6, S2, 0x20
+ vld U7, S2, 0x30
+
+ vst U0, TD, 0x00
+ vst U1, TD, 0x10
+ vst U2, TD, 0x20
+ vst U3, TD, 0x30
+ vst U4, TD, 0x40
+ vst U5, TD, 0x50
+ vst U6, TD, 0x60
+ vst U7, TD, 0x70
+
+ add.d S1, S1, T0
+ add.d S2, S2, T0
+ addi.d TD, TD, 0x80
+
+ addi.d I, I, -1
+ blt ZERO, I, .L_J1I1
+
+.L_J1M1: /* if(m&1) */
+ andi I, M, 0x01
+ beq ZERO, I, .L_J0
+
+ vld U0, S1, 0x00
+ vld U1, S1, 0x10
+ vld U2, S1, 0x20
+ vld U3, S1, 0x30
+
+ vst U0, TD, 0x00
+ vst U1, TD, 0x10
+ vst U2, TD, 0x20
+ vst U3, TD, 0x30
+
+ addi.d TD, TD, 0x40
+
+.L_J0:
+ addi.d J, J, -1
+ blt ZERO, J, .L_J1
+
+.L_N1: /* if(n&4) */
+ andi I, N, 0x04
+ beq ZERO, I, .L_N2
+
+ move S1, TS //aoffset1
+ slli.d T0, TL, 0x01 //2*lda
+ add.d S2, TS, TL
+ addi.d TS, TS, 0x20
+
+ srai.d I, M, 0x01
+ beq ZERO, I, .L_N1M1
+
+.L_N1I1: /* if(i>0) i-- */
+ vld U0, S1, 0x00
+ vld U1, S1, 0x10
+
+ vld U2, S2, 0x00
+ vld U3, S2, 0x10
+
+ vst U0, TD, 0x00
+ vst U1, TD, 0x10
+ vst U2, TD, 0x20
+ vst U3, TD, 0x30
+
+ add.d S1, S1, T0
+ add.d S2, S2, T0
+ addi.d TD, TD, 0x40
+
+ addi.d I, I, -1
+ blt ZERO, I, .L_N1I1
+
+.L_N1M1: /* if(m&1) */
+ andi I, M, 0x01
+ beq ZERO, I, .L_N2
+
+ vld U0, S1, 0x00
+ vld U1, S1, 0x10
+
+ vst U0, TD, 0x00
+ vst U1, TD, 0x10
+
+ addi.d TD, TD, 0x20
+
+.L_N2: /* if(n&2) */
+ andi I, N, 0x02
+ beq ZERO, I, .L_N3
+
+ move S1, TS //aoffset1
+ slli.d T0, TL, 0x01 //2*lda
+ add.d S2, TS, TL
+ addi.d TS, TS, 0x10
+
+ srai.d I, M, 0x01
+ beq ZERO, I, .L_N2M1
+
+.L_N2I1: /* if(i>0) i-- */
+ vld U0, S1, 0x00
+ vld U1, S2, 0x00
+
+ vst U0, TD, 0x00
+ vst U1, TD, 0x10
+
+ add.d S1, S1, T0
+ add.d S2, S2, T0
+
+ addi.d TD, TD, 0x20
+
+ addi.d I, I, -1
+ blt ZERO, I, .L_N2I1
+
+.L_N2M1: /* if(m&1) */
+ andi I, M, 0x01
+ beq ZERO, I, .L_N3
+
+ vld U0, S1, 0x00
+
+ vst U0, TD, 0x00
+
+ addi.d TD, TD, 0x10
+
+.L_N3: /* if(n&1) */
+ andi I, N, 0x01
+ beq ZERO, I, .L_N0
+
+ move S1, TS //aoffset1
+ slli.d T0, TL, 0x01 //2*lda
+ add.d S2, TS, TL
+
+ srai.d I, M, 0x01
+ beq ZERO, I, .L_N3M1
+
+.L_N3I1: /* if(i>0) i-- */
+ fld.s F0,
S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S new file mode 100644 index 000000000..6c4841b24 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.d VALPHAR, $sp, 112 + vldrepl.d VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v 
D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvand.v D0, U0, U0 + + xvpermi.q D0, U1, 0x02 + xvpermi.q U1, U0, 0x31 + + xvst D0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + xvld U0, S1, 0x00 + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld $vr0, S1, 0x00 + vst $vr0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..203471cbd --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U8, TD, 0x20 + vst U12, TD, 0x30 + + vst U1, TD, 0x40 + vst U5, TD, 0x50 + vst U9, TD, 0x60 + vst U13, TD, 0x70 + + vst U2, TD, 0x80 + vst U6, TD, 0x90 + vst U10, TD, 0xa0 + vst U14, TD, 0xb0 + + vst U3, TD, 0xc0 + vst U7, TD, 0xd0 + vst U11, TD, 0xe0 + vst U15, TD, 0xf0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U4, TD, 0x20 + vst U6, TD, 0x30 + + vst U1, TD, 0x40 + vst U3, TD, 0x50 + vst U5, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U1, TD, 0x20 + vst U5, TD, 0x30 + + vst U2, TD, 0x40 + vst U6, TD, 0x50 + vst U3, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 
0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U1, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..7cd8f605b --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S5, 0x00 + fld.d F1, S5, 0x08 + fld.d F2, S6, 0x00 + fld.d F3, S6, 0x08 + fld.d F4, S7, 0x00 + fld.d F5, S7, 0x08 + fld.d F6, S8, 0x00 + fld.d F7, S8, 0x08 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x10 
+ addi.d S2, S2, 0x10
+ addi.d TD, TD, 0x20
+
+ addi.d I, I, -1
+ blt ZERO, I, .L_N21
+
+.L_N3: /* if(n&1)*/
+ andi I, N, 0x01
+ beq I, ZERO, .L_N0
+
+ move S1, TS
+ move I, M
+
+ beq I, ZERO, .L_N0
+
+.L_N31: /* if(i>0)*/
+ fld.d F0, S1, 0x00
+ fld.d F1, S1, 0x08
+
+ fst.d F0, TD, 0x00
+ fst.d F1, TD, 0x08
+
+ addi.d S1, S1, 0x10
+ addi.d TD, TD, 0x10
+
+ addi.d I, I, -1
+ blt ZERO, I, .L_N31
+
+.L_N0:
+ LDARG $r23, $sp, 0
+ addi.d $sp, $sp, 8
+ jirl $r0, $r1, 0x00
+
+ EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S
new file mode 100644
index 000000000..1adee11c5
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S
@@ -0,0 +1,302 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + xvst U4, S8, 0x80 + xvst U5, S8, 0xa0 + xvst U6, S8, 0xc0 + xvst U7, S8, 0xe0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + xvst U2, S9, 0x40 + xvst U3, S9, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + vst $vr2, S10, 0x20 + vst $vr3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, S1, 0x00 + xvld U1, 
S2, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + xvld U0, S1, 0x00 + + xvst U0, S9, 0x00 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, S1, 0x00 + + vst $vr0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..954753eaf --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S @@ -0,0 +1,355 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + vst U8, S8, 0x80 + vst U9, S8, 0x90 + vst U10, S8, 0xa0 + vst U11, S8, 0xb0 + vst U12, S8, 0xc0 + vst U13, S8, 0xd0 + vst U14, S8, 0xe0 + vst U15, S8, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + vst U4, S9, 0x40 + vst U5, S9, 0x50 + vst U6, S9, 0x60 + vst U7, S9, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + vst U2, S10, 0x20 + vst U3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, 
diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S
new file mode 100644
index 000000000..f7440dc24
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S
@@ -0,0 +1,268 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4   // param 1: m
+#define N      $r5   // param 2: n
+#define SRC    $r6   // param 3: src
+#define LDA    $r7   // param 4: lda
+#define DST    $r8   // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
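+/* Packing order: N is consumed in panels of 8 complex elements, with 4/2/1
+   tails; M is walked two rows per iteration inside each panel. One complex
+   double occupies 16 bytes, so each 256-bit LASX register below carries two
+   elements, and the width-1 tail falls back to 128-bit LSX accesses. */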
+
+
+    PROLOGUE
+
+    addi.d     $sp, $sp, -8
+    SDARG      $r23, $sp, 0
+
+    move       TS, SRC         //aoffset
+    move       TD, DST         //boffset
+    slli.d     TL, LDA, 0x03   //TL = lda * 8
+    slli.d     TL, TL, 0x01    //TL = lda * 16 (one complex double = 16 bytes)
+
+    srai.d     J, N, 0x03      //j = n >> 3
+
+    beq        J, ZERO, .L_N1
+
+.L_J1: /* if(j>0) j-- */
+    move       S1, TS          //aoffset1
+    slli.d     T0, TL, 0x01    //2*lda
+    add.d      S2, TS, TL
+    addi.d     TS, TS, 0x80
+
+    srai.d     I, M, 0x01
+    beq        ZERO, I, .L_J1M1
+
+.L_J1I1: /* if(i>0) i-- */
+    xvld       U0, S1, 0x00
+    xvld       U1, S1, 0x20
+    xvld       U2, S1, 0x40
+    xvld       U3, S1, 0x60
+    xvld       U4, S2, 0x00
+    xvld       U5, S2, 0x20
+    xvld       U6, S2, 0x40
+    xvld       U7, S2, 0x60
+
+    xvst       U0, TD, 0x00
+    xvst       U1, TD, 0x20
+    xvst       U2, TD, 0x40
+    xvst       U3, TD, 0x60
+    xvst       U4, TD, 0x80
+    xvst       U5, TD, 0xa0
+    xvst       U6, TD, 0xc0
+    xvst       U7, TD, 0xe0
+
+    add.d      S1, S1, T0
+    add.d      S2, S2, T0
+    addi.d     TD, TD, 0x100
+
+    addi.d     I, I, -1
+    blt        ZERO, I, .L_J1I1
+
+.L_J1M1: /* if(m&1) */
+    andi       I, M, 0x01
+    beq        ZERO, I, .L_J0
+
+    xvld       U0, S1, 0x00
+    xvld       U1, S1, 0x20
+    xvld       U2, S1, 0x40
+    xvld       U3, S1, 0x60
+
+    xvst       U0, TD, 0x00
+    xvst       U1, TD, 0x20
+    xvst       U2, TD, 0x40
+    xvst       U3, TD, 0x60
+
+    addi.d     TD, TD, 0x80
+
+.L_J0:
+    addi.d     J, J, -1
+    blt        ZERO, J, .L_J1
+
+.L_N1: /* if(n&4) */
+    andi       I, N, 0x04
+    beq        ZERO, I, .L_N2
+
+    move       S1, TS          //aoffset1
+    slli.d     T0, TL, 0x01    //2*lda
+    add.d      S2, TS, TL
+    addi.d     TS, TS, 0x40
+
+    srai.d     I, M, 0x01
+    beq        ZERO, I, .L_N1M1
+
+.L_N1I1: /* if(i>0) i-- */
+    xvld       U0, S1, 0x00
+    xvld       U1, S1, 0x20
+    xvld       U2, S2, 0x00
+    xvld       U3, S2, 0x20
+
+    xvst       U0, TD, 0x00
+    xvst       U1, TD, 0x20
+    xvst       U2, TD, 0x40
+    xvst       U3, TD, 0x60
+
+    add.d      S1, S1, T0
+    add.d      S2, S2, T0
+    addi.d     TD, TD, 0x80
+
+    addi.d     I, I, -1
+    blt        ZERO, I, .L_N1I1
+
+.L_N1M1: /* if(m&1) */
+    andi       I, M, 0x01
+    beq        ZERO, I, .L_N2
+
+    xvld       U0, S1, 0x00
+    xvld       U1, S1, 0x20
+
+    xvst       U0, TD, 0x00
+    xvst       U1, TD, 0x20
+
+    addi.d     TD, TD, 0x40
+
+.L_N2: /* if(n&2) */
+    andi       I, N, 0x02
+    beq        ZERO, I, .L_N3
+
+    move       S1, TS          //aoffset1
+    slli.d     T0, TL, 0x01    //2*lda
+    add.d      S2, TS, TL
+    addi.d     TS, TS, 0x20
+
+    srai.d     I, M, 0x01
+    beq        ZERO, I, .L_N2M1
+
+.L_N2I1: /* if(i>0) i-- */
+    xvld       U0, S1, 0x00
+    xvld       U1, S2, 0x00
+
+    xvst       U0, TD, 0x00
+    xvst       U1, TD, 0x20
+
+    add.d      S1, S1, T0
+    add.d      S2, S2, T0
+
+    addi.d     TD, TD, 0x40
+
+    addi.d     I, I, -1
+    blt        ZERO, I, .L_N2I1
+
+.L_N2M1: /* if(m&1) */
+    andi       I, M, 0x01
+    beq        ZERO, I, .L_N3
+
+    xvld       U0, S1, 0x00
+
+    xvst       U0, TD, 0x00
+
+    addi.d     TD, TD, 0x20
+
+.L_N3: /* if(n&1) */
+    andi       I, N, 0x01
+    beq        ZERO, I, .L_N0
+
+    move       S1, TS          //aoffset1
+    slli.d     T0, TL, 0x01    //2*lda
+    add.d      S2, TS, TL
+
+    srai.d     I, M, 0x01
+    beq        ZERO, I, .L_N3M1
+
+.L_N3I1: /* if(i>0) i-- */
+    vld        $vr0, S1, 0x00
+    vld        $vr1, S2, 0x00
+
+    vst        $vr0, TD, 0x00
+    vst        $vr1, TD, 0x10
+
+    add.d      S1, S1, T0
+    add.d      S2, S2, T0
+    addi.d     TD, TD, 0x20
+
+    addi.d     I, I, -1
+    blt        ZERO, I, .L_N3I1
+
+.L_N3M1: /* if(m&1) */
+    andi       I, M, 0x01
+    beq        ZERO, I, .L_N0
+
+    vld        $vr0, S1, 0x00
+
+    vst        $vr0, TD, 0x00
+
+.L_N0:
+    LDARG      $r23, $sp, 0
+    addi.d     $sp, $sp, 8
+    jirl       $r0, $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
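The destination pointer TD above only ever advances, so the packed buffer is written strictly sequentially: full 8-wide panels first, then the n&4, n&2 and n&1 tails. In equivalent C (a sketch under the same conventions as before; ztcopy_8_ref is an illustrative name, not a file in the tree):

    /* Pack an m x n complex-double block of a (stride lda, in elements)
       into b as contiguous panels of 8, then 4, 2, 1 columns. */
    static void ztcopy_8_ref(long m, long n, const double *a, long lda,
                             double *b)
    {
        const double *panel = a;                /* current panel start */

        for (long j = 0; j + 8 <= n; j += 8) {  /* 8 complex = 16 doubles */
            for (long i = 0; i < m; i++)
                for (long k = 0; k < 16; k++)
                    *b++ = panel[2 * i * lda + k];
            panel += 16;
        }
        for (long w = 4; w >= 1; w >>= 1) {     /* tails: 4, 2, 1 columns */
            if (n & w) {
                for (long i = 0; i < m; i++)
                    for (long k = 0; k < 2 * w; k++)
                        *b++ = panel[2 * i * lda + k];
                panel += 2 * w;
            }
        }
    }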
diff --git a/param.h b/param.h
index 003ea396f..5d2e960a2 100644
--- a/param.h
+++ b/param.h
@@ -2854,12 +2854,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define QGEMM_DEFAULT_UNROLL_N 2
 #define CGEMM_DEFAULT_UNROLL_N 2
-#define ZGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 4
 #define XGEMM_DEFAULT_UNROLL_N 1

 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 8
 #define XGEMM_DEFAULT_UNROLL_M 1

 #define SGEMM_DEFAULT_P 256
@@ -2891,10 +2891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_UNROLL_M 8
 #define DGEMM_DEFAULT_UNROLL_N 4

-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4

-#define ZGEMM_DEFAULT_UNROLL_M 1
+#define ZGEMM_DEFAULT_UNROLL_M 4
 #define ZGEMM_DEFAULT_UNROLL_N 4

 #define SGEMM_DEFAULT_P 128
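The unroll constants above have to agree with the blocking of the kernels registered in the KERNEL files: the copy routines emit packed A in slices of UNROLL_M rows and packed B in slices of UNROLL_N columns, and the level-3 driver hands one slice of each to every micro-kernel call. A hypothetical sketch of that pairing for the 8x4 zgemm case (tile_loop and the micro-kernel signature are illustrative, not OpenBLAS interfaces; edge tiles are omitted):

    enum { UNROLL_M = 8, UNROLL_N = 4 };   /* ZGEMM values set above */

    /* Each call multiplies an (UNROLL_M x k) packed-A slice by a
       (k x UNROLL_N) packed-B slice into the matching tile of C. */
    static void tile_loop(long m, long n, long k,
                          const double *pa, const double *pb,
                          double *c, long ldc,
                          void (*micro)(long, const double *,
                                        const double *, double *, long))
    {
        for (long j = 0; j + UNROLL_N <= n; j += UNROLL_N)
            for (long i = 0; i + UNROLL_M <= m; i += UNROLL_M)
                micro(k,
                      pa + 2 * i * k,              /* 8-row A slice */
                      pb + 2 * j * k,              /* 4-column B slice */
                      c + 2 * (i + j * ldc), ldc); /* column-major complex C */
    }

If the macros and the kernel blocking disagree, the driver indexes the packed buffers at offsets the copy kernels never produced, which is why the 2x2-to-8x4 kernel swap and these definitions land in the same change.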