diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 802dd1c9b..cb230b348 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lsx.S ISAMINKERNEL = isamin_lsx.S IDAMINKERNEL = idamin_lsx.S -SCOPYKERNEL = scopy_lsx.S -DCOPYKERNEL = dcopy_lsx.S +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S -SSWAPKERNEL = sswap_lsx.S -DSWAPKERNEL = dswap_lsx.S +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S SAXPYKERNEL = saxpy_lsx.S DAXPYKERNEL = daxpy_lsx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S -SSUMKERNEL = ssum_lsx.S -DSUMKERNEL = dsum_lsx.S +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3253489d9..ba59c4566 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lasx.S ISAMINKERNEL = isamin_lasx.S IDAMINKERNEL = idamin_lasx.S -SCOPYKERNEL = scopy_lasx.S -DCOPYKERNEL = dcopy_lasx.S +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S -SSWAPKERNEL = sswap_lasx.S -DSWAPKERNEL = dswap_lasx.S +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S SAXPYKERNEL = saxpy_lasx.S DAXPYKERNEL = daxpy_lasx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S -SSUMKERNEL = ssum_lasx.S -DSUMKERNEL = dsum_lasx.S +SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 32 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, 
.L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S new file mode 100644 index 000000000..bb10f3565 --- /dev/null +++ b/kernel/loongarch64/copy_lsx.S @@ -0,0 +1,316 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 + vst VX1, Y, 16 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 32 + vst VX1, Y, 48 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 16 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S deleted file mode 100644 index 9d7da4a80..000000000 --- a/kernel/loongarch64/dcopy_lasx.S +++ /dev/null @@ -1,224 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - 
xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvst VX0, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S deleted file mode 100644 index 161655bbd..000000000 --- a/kernel/loongarch64/dcopy_lsx.S +++ /dev/null @@ -1,232 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - addi.d I, I, -1 - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE 
- vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 0 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 2 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 4 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S index 2a9c3cf7b..5a6f7cf1e 100644 --- a/kernel/loongarch64/dnrm2_lasx.S +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VM0 $xr17 @@ -35,6 +69,7 @@ xvxor.v res1, res1, res1 xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,12 +81,11 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - xvld VM0, X, 0 bge $r0, I, .L97 .align 3 .L10: - xvld VX0, X, 0 * SIZE + xvld VX0, X, 0 xvld VX1, X, 4 * SIZE xvfmaxa.d VM1, VX1, VX0 xvfmaxa.d VM0, VM0, VM1 @@ -62,40 +96,32 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 .align 3 .L21: - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t4, 3 xvfmaxa.d VM1, VX0, VX1 @@ -109,9 +135,9 @@ xvpickve.d VX0, VM0, 1 xvpickve.d VX1, VM0, 2 xvpickve.d VM3, VM0, 3 - xvfmaxa.d VM1, VX0, VX1 - xvfmaxa.d VM2, VM3, VM0 - xvfmaxa.d VM0, VM1, VM2 + fmaxa.d $f17, $f17, $f14 + fmaxa.d $f17, $f17, $f15 + fmaxa.d $f17, $f17, $f16 .align 3 .L97: @@ -149,12 +175,12 @@ .align 3 .L110: - xvld VX0, XX, 0 * SIZE + xvld VX0, XX, 0 xvld VX1, XX, 4 * SIZE - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -166,34 +192,34 @@ bge $r0, I, .L997 .L121: - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 
add.d XX, XX, INCX xvinsgr2vr.d VX0, t1, 0 xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 add.d XX, XX, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 xvinsgr2vr.d VX1, t3, 2 xvinsgr2vr.d VX1, t4, 3 - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -203,10 +229,10 @@ xvfadd.d res1, res1, res2 xvpickve.d VX0, res1, 1 xvpickve.d VX1, res1, 2 - xvpickve.d VM0, res1, 3 - xvfadd.d res1, VX0, res1 - xvfadd.d VX1, VX1, VM0 - xvfadd.d res1, VX1, res1 + xvpickve.d VM2, res1, 3 + fadd.d $f19, $f19, $f15 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f13 .align 3 .L997: @@ -215,19 +241,17 @@ .align 3 .L998: - fld.d $f15, XX, 0 * SIZE + fld.d $f15, XX, 0 addi.d I, I, -1 fmul.d $f15, $f15, ALPHA fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 - .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 + .align 3 EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S index e4615e18d..fce4260e2 100644 --- a/kernel/loongarch64/dnrm2_lsx.S +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $vr15 #define VX1 $vr16 #define VM0 $vr17 @@ -35,6 +69,7 @@ vxor.v res1, res1, res1 vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,7 +81,7 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - vld VM0, X, 0 + bge $r0, I, .L97 .align 3 @@ -66,15 +101,7 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 .align 3 .L21: @@ -154,16 +181,16 @@ .L110: vld VX0, XX, 0 * SIZE vld VX1, XX, 2 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 vld VX0, XX, 4 * SIZE vld VX1, XX, 6 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -173,6 +200,7 @@ .L120: srai.d I, N, 3 bge $r0, I, .L997 + .align 3 .L121: ld.d t1, XX, 0 * SIZE @@ -187,14 +215,14 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA + vfmul.d VM2, VX0, VALPHA ld.d t1, XX, 0 * SIZE add.d XX, XX, INCX - vfmul.d VM1, VX1, VALPHA + vfmul.d VM3, VX1, VALPHA ld.d t2, XX, 0 * SIZE add.d XX, XX, INCX - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 ld.d t3, XX, 0 * SIZE add.d XX, XX, INCX ld.d t4, XX, 0 * SIZE @@ -203,10 +231,10 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -230,13 +258,11 @@ fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S deleted file mode 100644 index 3c51dab60..000000000 --- a/kernel/loongarch64/dsum_lasx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, 
res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fadd.d $f16, $f12, $f16 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S deleted file mode 100644 index 402d087df..000000000 --- a/kernel/loongarch64/dsum_lsx.S +++ /dev/null @@ -1,123 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d 
I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S deleted file mode 100644 index 221cb7fa2..000000000 --- a/kernel/loongarch64/dswap_lasx.S +++ /dev/null @@ -1,301 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvld VX2, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 3 - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 3 - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - xvstelm.d VX2, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX2, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - 
xvstelm.d VX2, X, 0, 3 - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvst VX0, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX3, X, 0, 0 - add.d X, X, INCY - ld.d t2, X, 0 * SIZE - xvstelm.d VX3, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX3, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvstelm.d VX3, X, 0, 3 - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S deleted file mode 100644 index 7f7f585e1..000000000 --- a/kernel/loongarch64/dswap_lsx.S +++ /dev/null @@ -1,317 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne 
INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX2, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - addi.d I, I, -1 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 4 * SIZE - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 2 * SIZE - vld VX2, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - 
blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S deleted file mode 100644 index 7db1e7cee..000000000 --- a/kernel/loongarch64/scopy_lasx.S +++ /dev/null @@ -1,216 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 7 - add.d Y, Y, INCY - addi.d X, X, 8 
* SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvst VX0, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S deleted file mode 100644 index 32150d3d6..000000000 --- a/kernel/loongarch64/scopy_lsx.S +++ /dev/null @@ -1,220 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 
- b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vst VX0, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S index 274908c14..3ae11e897 100644 --- a/kernel/loongarch64/snrm2_lasx.S +++ b/kernel/loongarch64/snrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -11,10 +43,13 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VX2 $xr17 #define VX3 $xr18 +#define VX4 $xr21 #define res1 $xr19 #define res2 $xr20 @@ -37,14 +72,13 @@ .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 0 * SIZE - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvld VX0, X, 0 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE blt $r0, I, .L10 .align 3 b .L996 @@ -54,70 +88,46 @@ .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX xvinsgr2vr.w VX0, t1, 0 xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX + ld.w t4, X, 0 xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 0 - xvinsgr2vr.w VX1, t2, 1 - xvinsgr2vr.w VX1, t3, 2 - xvinsgr2vr.w VX1, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 4 - xvinsgr2vr.w VX1, t2, 5 - xvinsgr2vr.w VX1, t3, 6 - xvinsgr2vr.w VX1, t4, 7 - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 .L996: xvfadd.d res1, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - 
xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f17 + fadd.d $f19, $f19, $f18 .align 3 .L997: @@ -126,11 +136,11 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE - addi.d I, I, -1 + fld.s $f15, X, 0 + add.d X, X, INCX + addi.d I, I, -1 fcvt.d.s $f15, $f15 - fmadd.d $f19, $f15, $f15, $f19 - add.d X, X, INCX + fmadd.d $f19, $f15, $f15, $f19 blt $r0, I, .L998 .align 3 diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S index 17d017900..bb492dbf0 100644 --- a/kernel/loongarch64/snrm2_lsx.S +++ b/kernel/loongarch64/snrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -15,6 +47,9 @@ #define VX1 $vr16 #define VX2 $vr17 #define VX3 $vr18 +#define VX4 $vr21 +#define VX5 $vr22 +/* Don't change following FR unless you know the effects. 
*/ #define res1 $vr19 #define res2 $vr20 @@ -24,99 +59,71 @@ LDINT N, 0(N) LDINT INCX, 0(INCX) #endif - vxor.v res1, res1, res1 vxor.v res2, res2, res2 - bge $r0, N, .L999 + bge $r0, N, .L999 beq $r0, INCX, .L999 li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - bge $r0, I, .L997 + bge $r0, I, .L997 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 0 * SIZE - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - vld VX2, X, 4 * SIZE - vld VX3, X, 4 * SIZE - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vld VX0, X, 0 + vld VX5, X, 4 * SIZE addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfcvtl.d.s VX3, VX5 + vfcvth.d.s VX4, VX5 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 blt $r0, I, .L10 b .L996 .align 3 - .L20: bge $r0, I, .L997 .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX vinsgr2vr.w VX0, t1, 0 vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 @@ -124,12 +131,8 @@ .L996: vfadd.d res1, res1, res2 - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 .align 3 .L997: @@ -138,7 +141,7 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE + fld.s $f15, X, 0 addi.d I, I, -1 fcvt.d.s $f15, $f15 fmadd.d $f19, $f15, $f15, $f19 diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S deleted file mode 100644 index 7cf57bc77..000000000 --- a/kernel/loongarch64/ssum_lasx.S +++ /dev/null @@ -1,140 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 
-#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvfadd.s res1, VX0, res1 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfadd.s res1, VX0, res1 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S deleted file mode 100644 index de63c69e3..000000000 --- a/kernel/loongarch64/ssum_lsx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s 
res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S deleted file mode 100644 index 7184eff45..000000000 --- a/kernel/loongarch64/sswap_lasx.S +++ /dev/null @@ -1,286 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 3 - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, 
INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 7 - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 3 - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 4 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 5 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 6 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 7 - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S deleted file 
mode 100644 index 4f19a8024..000000000 --- a/kernel/loongarch64/sswap_lsx.S +++ /dev/null @@ -1,294 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - addi.d I, I, -1 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 3 - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 3 - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - vstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - vstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX2, X, 0, 3 - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vst VX0, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX3, X, 0, 0 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - vstelm.w VX3, X, 0, 1 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - vstelm.w VX3, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX3, X, 0, 3 - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w 
VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S new file mode 100644 index 000000000..fd6d5adb3 --- /dev/null +++ b/kernel/loongarch64/sum_lasx.S @@ -0,0 +1,225 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 + xvfadd.s res1, res1, VX0 +#ifdef DOUBLE + xvld VX1, X, 32 + xvfadd.s res1, res1, VX1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 
+ xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S new file mode 100644 index 000000000..6b2027781 --- /dev/null +++ b/kernel/loongarch64/sum_lsx.S @@ -0,0 +1,204 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 + vld VX1, X, 16 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, res1, VX1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S new file mode 100644 index 000000000..4767fffe3 --- /dev/null +++ b/kernel/loongarch64/swap_lasx.S @@ -0,0 +1,401 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + xvld VX2, Y, 0 + addi.d I, I, -1 + xvst VX2, X, 0 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvld VX2, Y, 32 + xvst VX2, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + ld.d t1, Y, 0 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + 
xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX0, X, 0 + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 + ld.d t1, X, 0 + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + 
add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S new file mode 100644 index 000000000..736187f93 --- /dev/null +++ b/kernel/loongarch64/swap_lsx.S @@ -0,0 +1,431 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and incy==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, Y, 0 + vld VX3, Y, 16 + addi.d I, I, -1 + vst VX2, X, 0 + vst VX3, X, 16 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vld VX2, Y, 32 + vld VX3, Y, 48 + vst VX2, X, 32 + vst VX3, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: +#ifdef DOUBLE + fld.d $f12, X, 0 + fld.d $f14, Y, 0 + addi.d I, I, -1 + fst.d $f12, Y, 0 + fst.d $f14, X, 0 +#else + fld.s $f12, X, 0 + fld.s $f14, Y, 0 + addi.d I, I, -1 + fst.s $f12, Y, 0 + fst.s $f14, X, 0 +#endif + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 + ld.w t1, Y, 0 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX0, Y, 0, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 
7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE +#else + vld VX2, Y, 0 + ld.w t1, X, 0 + vstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX2, X, 0, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 + vstelm.w VX3, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX3, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX3, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX3, X, 0, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + ST $f12, Y, 0 * SIZE + ST $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 
+ + EPILOGUE
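
Note (editorial, not part of the patch above): the new copy_lsx.S/copy_lasx.S, swap_lsx.S/swap_lasx.S and sum_lsx.S/sum_lasx.S sources fold the former single- and double-precision kernels into one file each, selecting the element width with #ifdef DOUBLE and the LD/ST/ADD/VFADD macros from common.h, while snrm2_lsx.S/snrm2_lasx.S keep widening float inputs to double before accumulating. The C sketch below is a minimal, hypothetical smoke test for the interfaces these kernels sit behind; the file name, build line, tolerance-free printout and the use of cblas_ssum (an OpenBLAS-specific extension, not standard CBLAS) are assumptions made for illustration only. It deliberately uses a length that is not a multiple of 8 together with unit and non-unit strides, so the 8-element vector bodies (srai.d I, N, 3), the scalar tails (andi I, N, 7) and the different INCX/INCY branches are all reached.

/* test_kernels.c -- hypothetical smoke test, not part of this patch.
 * Build against the freshly built library, e.g.:
 *   gcc test_kernels.c -I<openblas>/include -L<openblas>/lib -lopenblas -lm -o test_kernels
 */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    /* n = 11 is not a multiple of 8, so both the vector loop and the
     * scalar tail of the LSX/LASX kernels run; 32 elements leave room
     * for stride-2 accesses (last index used: (11-1)*2 = 20). */
    const int n = 11;
    float x[32], y[32];
    for (int i = 0; i < 32; i++) { x[i] = (float)(i + 1); y[i] = 0.0f; }

    cblas_scopy(n, x, 1, y, 1);   /* copy kernel, INCX==1 and INCY==1 path */
    cblas_scopy(n, x, 1, y, 2);   /* copy kernel, INCX==1 and INCY!=1 path */
    cblas_sswap(n, x, 2, y, 1);   /* swap kernel, INCX!=1 and INCY==1 path */

    /* snrm2 with a non-unit stride exercises the strided loop of
     * snrm2_lsx.S / snrm2_lasx.S (float inputs accumulated in double). */
    float nrm = cblas_snrm2(n, x, 2);

    /* ?sum (sum without absolute values) is an OpenBLAS extension; it is
     * the interface behind the renamed sum_lsx.S / sum_lasx.S kernels.
     * Guard or drop this call when linking against a plain reference CBLAS. */
    float s = cblas_ssum(n, y, 1);

    printf("snrm2 = %f, ssum = %f\n", nrm, s);
    return 0;
}

The same sources service the double-precision entry points (cblas_dcopy, cblas_dswap, cblas_dsum, cblas_dnrm2) through the DOUBLE build of the kernel, so a double variant of the loop above would exercise the #ifdef DOUBLE paths added in this patch.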