loongarch64: Refine copy, swap, nrm2 and sum optimizations.

Shiyou Yin 2023-12-27 11:30:17 +08:00
parent c6996a80e9
commit 36c12c4971
24 changed files with 2159 additions and 2816 deletions
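
The new files below merge the single- and double-precision copy, swap and sum kernels into one source each and rework the nrm2 reductions. For orientation, here is a minimal C reference of the level-1 semantics these assembly files implement (helper names are illustrative, positive increments assumed; this sketch is not part of the patch):

    #include <math.h>

    /* copy: y[i*incy] = x[i*incx] */
    static void copy_ref(long n, const double *x, long incx, double *y, long incy) {
        for (long i = 0; i < n; i++) y[i * incy] = x[i * incx];
    }

    /* swap: exchange the contents of x and y */
    static void swap_ref(long n, double *x, long incx, double *y, long incy) {
        for (long i = 0; i < n; i++) {
            double t = x[i * incx];
            x[i * incx] = y[i * incy];
            y[i * incy] = t;
        }
    }

    /* sum: plain sum of the elements (unlike asum, no absolute value is taken) */
    static double sum_ref(long n, const double *x, long incx) {
        double s = 0.0;
        for (long i = 0; i < n; i++) s += x[i * incx];
        return s;
    }

    /* nrm2: Euclidean norm; the real kernels scale first to avoid
       overflow/underflow, see the nrm2 files further down */
    static double nrm2_ref(long n, const double *x, long incx) {
        double s = 0.0;
        for (long i = 0; i < n; i++) s += x[i * incx] * x[i * incx];
        return sqrt(s);
    }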

View File

@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lsx.S
ISAMINKERNEL = isamin_lsx.S
IDAMINKERNEL = idamin_lsx.S
SCOPYKERNEL = scopy_lsx.S
DCOPYKERNEL = dcopy_lsx.S
SCOPYKERNEL = copy_lsx.S
DCOPYKERNEL = copy_lsx.S
SSWAPKERNEL = sswap_lsx.S
DSWAPKERNEL = dswap_lsx.S
SSWAPKERNEL = swap_lsx.S
DSWAPKERNEL = swap_lsx.S
SAXPYKERNEL = saxpy_lsx.S
DAXPYKERNEL = daxpy_lsx.S
@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lsx.S
SAXPBYKERNEL = saxpby_lsx.S
DAXPBYKERNEL = daxpby_lsx.S
SSUMKERNEL = ssum_lsx.S
DSUMKERNEL = dsum_lsx.S
SSUMKERNEL = sum_lsx.S
DSUMKERNEL = sum_lsx.S
SASUMKERNEL = sasum_lsx.S
DASUMKERNEL = dasum_lsx.S

View File

@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lasx.S
ISAMINKERNEL = isamin_lasx.S
IDAMINKERNEL = idamin_lasx.S
SCOPYKERNEL = scopy_lasx.S
DCOPYKERNEL = dcopy_lasx.S
SCOPYKERNEL = copy_lasx.S
DCOPYKERNEL = copy_lasx.S
SSWAPKERNEL = sswap_lasx.S
DSWAPKERNEL = dswap_lasx.S
SSWAPKERNEL = swap_lasx.S
DSWAPKERNEL = swap_lasx.S
SAXPYKERNEL = saxpy_lasx.S
DAXPYKERNEL = daxpy_lasx.S
@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lasx.S
SAXPBYKERNEL = saxpby_lasx.S
DAXPBYKERNEL = daxpby_lasx.S
SSUMKERNEL = ssum_lasx.S
DSUMKERNEL = dsum_lasx.S
SSUMKERNEL = sum_lasx.S
DSUMKERNEL = sum_lasx.S
SASUMKERNEL = sasum_lasx.S
DASUMKERNEL = dasum_lasx.S

View File

@ -0,0 +1,306 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $xr12
#define VX1 $xr13
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
/* INCX==1 and INCY==1 */
.L11:
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0
addi.d I, I, -1
xvst VX0, Y, 0
#ifdef DOUBLE
xvld VX0, X, 32
xvst VX0, Y, 32
#endif
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
LD $f12, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
ST $f12, Y, 0
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
/* INCX==1 and INCY!=1 */
.L12:
bge $r0, I, .L122
.align 3
.L121:
#ifdef DOUBLE
xvld VX0, X, 0
xvld VX1, X, 32
xvstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 2
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 3
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 2
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 3
add.d Y, Y, INCY
#else
xvld VX0, X, 0
xvstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 3
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 4
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 5
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 6
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 7
add.d Y, Y, INCY
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
LD $f12, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
ST $f12, Y, 0
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
/* INCX!=1 and INCY==1 */
.L21:
bge $r0, I, .L212
.align 3
.L211:
#ifdef DOUBLE
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvst VX0, Y, 0
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvst VX1, Y, 32
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvst VX0, Y, 0
#endif
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
LD $f12, X, 0
addi.d I, I, -1
ST $f12, Y, 0
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
/* INCX!=1 and INCY!=1 */
.L22:
bge $r0, I, .L223
.align 3
.L222:
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
ST a1, Y, 0
add.d Y, Y, INCY
ST a2, Y, 0
add.d Y, Y, INCY
ST a3, Y, 0
add.d Y, Y, INCY
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
ST a1, Y, 0
add.d Y, Y, INCY
ST a2, Y, 0
add.d Y, Y, INCY
ST a3, Y, 0
add.d Y, Y, INCY
ST a4, Y, 0
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
LD $f12, X, 0
addi.d I, I, -1
ST $f12, Y, 0
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE
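
The unified copy kernel above follows the usual split: srai.d I, N, 3 sizes an 8-elements-per-iteration vector body, andi I, N, 7 leaves the scalar tail, and separate paths cover the four unit/non-unit INCX/INCY combinations. A sketch of that blocking for the contiguous case (illustrative only, not part of the patch):

    /* 8-way blocked copy, contiguous case; mirrors .L111 (vector body)
       and .L113 (scalar tail) in the file above */
    static void copy_blocked(long n, const double *x, double *y) {
        long i = 0;
        for (; i + 8 <= n; i += 8)       /* xvld/xvst, advance by 8 * SIZE */
            for (long k = 0; k < 8; k++)
                y[i + k] = x[i + k];
        for (; i < n; i++)               /* tail: andi I, N, 7 iterations */
            y[i] = x[i];
    }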

View File

@ -0,0 +1,316 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $vr12
#define VX1 $vr13
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
/* INCX==1 and INCY==1 */
.L11:
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0
vld VX1, X, 16
addi.d I, I, -1
vst VX0, Y, 0
vst VX1, Y, 16
#ifdef DOUBLE
vld VX0, X, 32
vld VX1, X, 48
vst VX0, Y, 32
vst VX1, Y, 48
#endif
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
LD $f12, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
ST $f12, Y, 0
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
/* INCX==1 and INCY!=1 */
.L12:
bge $r0, I, .L122
.align 3
.L121:
#ifdef DOUBLE
vld VX0, X, 0
vld VX1, X, 16
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
vld VX0, X, 32
vld VX1, X, 48
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
#else
vld VX0, X, 0
vld VX1, X, 16
vstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 3
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 1
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 2
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 3
add.d Y, Y, INCY
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
LD $f12, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
ST $f12, Y, 0
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
/* INCX!=1 and INCY==1 */
.L21:
bge $r0, I, .L212
.align 3
.L211:
#ifdef DOUBLE
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX0, Y, 0
vst VX1, Y, 16
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX0, Y, 32
vst VX1, Y, 48
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vst VX0, Y, 0
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vst VX1, Y, 16
#endif
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
LD $f12, X, 0
addi.d I, I, -1
ST $f12, Y, 0
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
/* INCX!=1 and INCY!=1 */
.L22:
bge $r0, I, .L223
.align 3
.L222:
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
ST a1, Y, 0
add.d Y, Y, INCY
ST a2, Y, 0
add.d Y, Y, INCY
ST a3, Y, 0
add.d Y, Y, INCY
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
ST a1, Y, 0
add.d Y, Y, INCY
ST a2, Y, 0
add.d Y, Y, INCY
ST a3, Y, 0
add.d Y, Y, INCY
ST a4, Y, 0
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
LD $f12, X, 0
addi.d I, I, -1
ST $f12, Y, 0
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,224 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $xr12
#define VX1 $xr13
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.d $f12, Y, 0 * SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12:
bge $r0, I, .L122
.align 3
.L121:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 2
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0, 3
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 2
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0, 3
add.d Y, Y, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.d $f12, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvst VX0, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bgez INCX, .L220
.align 3
.L220:
bge $r0, I, .L223
.align 3
.L222:
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a3, X, 0 * SIZE
add.d Y, Y, INCY
fst.d a4, X, 0 * SIZE
add.d Y, Y, INCY
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a3, X, 0 * SIZE
add.d Y, Y, INCY
fst.d a4, X, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,232 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $vr12
#define VX1 $vr13
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vst VX0, Y, 0 * SIZE
vst VX1, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
addi.d I, I, -1
vst VX0, Y, 4 * SIZE
vst VX1, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.d $f12, Y, 0 * SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12:
bge $r0, I, .L122
.align 3
.L121:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.d $f12, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vst VX0, Y, 0 * SIZE
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX1, Y, 2 * SIZE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vst VX0, Y, 4 * SIZE
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX1, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bgez INCX, .L220
.align 3
.L220:
bge $r0, I, .L223
.align 3
.L222:
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a3, X, 0 * SIZE
add.d Y, Y, INCY
fst.d a4, X, 0 * SIZE
add.d Y, Y, INCY
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d a3, X, 0 * SIZE
add.d Y, Y, INCY
fst.d a4, X, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,3 +1,35 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
@ -12,6 +44,8 @@
#define t2 $r13
#define t3 $r14
#define t4 $r15
/* Don't change the following FRs unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VM0 $xr17
@ -35,6 +69,7 @@
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
xvxor.v VM0, VM0, VM0
bge $r0, N, .L999
beq $r0, INCX, .L999
move XX, X
@ -46,12 +81,11 @@
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
xvld VM0, X, 0
bge $r0, I, .L97
.align 3
.L10:
xvld VX0, X, 0 * SIZE
xvld VX0, X, 0
xvld VX1, X, 4 * SIZE
xvfmaxa.d VM1, VX1, VX0
xvfmaxa.d VM0, VM0, VM1
@ -62,40 +96,32 @@
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the maxa value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L97
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
.align 3
.L21:
ld.d t1, X, 0 * SIZE
ld.d t1, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
ld.d t2, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
ld.d t3, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t3, 2
ld.d t4, X, 0 * SIZE
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
ld.d t1, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
ld.d t2, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
ld.d t3, X, 0 * SIZE
ld.d t3, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t3, 2
ld.d t4, X, 0 * SIZE
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t4, 3
xvfmaxa.d VM1, VX0, VX1
@ -109,9 +135,9 @@
xvpickve.d VX0, VM0, 1
xvpickve.d VX1, VM0, 2
xvpickve.d VM3, VM0, 3
xvfmaxa.d VM1, VX0, VX1
xvfmaxa.d VM2, VM3, VM0
xvfmaxa.d VM0, VM1, VM2
fmaxa.d $f17, $f17, $f14
fmaxa.d $f17, $f17, $f15
fmaxa.d $f17, $f17, $f16
.align 3
.L97:
@ -149,12 +175,12 @@
.align 3
.L110:
xvld VX0, XX, 0 * SIZE
xvld VX0, XX, 0
xvld VX1, XX, 4 * SIZE
xvfmul.d VM0, VX0, VALPHA
xvfmul.d VM1, VX1, VALPHA
xvfmadd.d res1, VM0, VM0, res1
xvfmadd.d res2, VM1, VM1, res2
xvfmul.d VM2, VX0, VALPHA
xvfmul.d VM3, VX1, VALPHA
xvfmadd.d res1, VM2, VM2, res1
xvfmadd.d res2, VM3, VM3, res2
addi.d XX, XX, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L110
@ -166,34 +192,34 @@
bge $r0, I, .L997
.L121:
ld.d t1, XX, 0 * SIZE
ld.d t1, XX, 0
add.d XX, XX, INCX
ld.d t2, XX, 0 * SIZE
ld.d t2, XX, 0
add.d XX, XX, INCX
ld.d t3, XX, 0 * SIZE
ld.d t3, XX, 0
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
ld.d t4, XX, 0
add.d XX, XX, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, XX, 0 * SIZE
ld.d t1, XX, 0
add.d XX, XX, INCX
ld.d t2, XX, 0 * SIZE
ld.d t2, XX, 0
add.d XX, XX, INCX
ld.d t3, XX, 0 * SIZE
ld.d t3, XX, 0
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
ld.d t4, XX, 0
add.d XX, XX, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfmul.d VM0, VX0, VALPHA
xvfmul.d VM1, VX1, VALPHA
xvfmadd.d res1, VM0, VM0, res1
xvfmadd.d res2, VM1, VM1, res2
xvfmul.d VM2, VX0, VALPHA
xvfmul.d VM3, VX1, VALPHA
xvfmadd.d res1, VM2, VM2, res1
xvfmadd.d res2, VM3, VM3, res2
addi.d I, I, -1
blt $r0, I, .L121
b .L996
@ -203,10 +229,10 @@
xvfadd.d res1, res1, res2
xvpickve.d VX0, res1, 1
xvpickve.d VX1, res1, 2
xvpickve.d VM0, res1, 3
xvfadd.d res1, VX0, res1
xvfadd.d VX1, VX1, VM0
xvfadd.d res1, VX1, res1
xvpickve.d VM2, res1, 3
fadd.d $f19, $f19, $f15
fadd.d $f19, $f19, $f16
fadd.d $f19, $f19, $f13
.align 3
.L997:
@ -215,19 +241,17 @@
.align 3
.L998:
fld.d $f15, XX, 0 * SIZE
fld.d $f15, XX, 0
addi.d I, I, -1
fmul.d $f15, $f15, ALPHA
fmadd.d $f19, $f15, $f15, $f19
add.d XX, XX , INCX
blt $r0, I, .L998
fsqrt.d $f19, $f19
fmul.d $f0, max, $f19
jirl $r0, $r1, 0x0
.align 3
.L999:
fmov.d $f0, $f19
fsqrt.d $f19, $f19
fmul.d $f0, max, $f19
jirl $r0, $r1, 0x0
.align 3
EPILOGUE
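
The dnrm2 changes above keep the kernel's structure: a first pass reduces the largest magnitude with xvfmaxa.d, a second pass accumulates (x[i] * VALPHA)^2, and the result is max * sqrt(sum) (fsqrt.d followed by fmul.d with max). Reading VALPHA as 1/max is an inference from the register names; a scalar sketch of that scaled formulation:

    /* Scaled two-pass nrm2 (sketch): avoids overflow/underflow by
       squaring x[i]/max instead of x[i].  alpha = 1/max is inferred. */
    #include <math.h>
    static double dnrm2_scaled(long n, const double *x, long incx) {
        double maxa = 0.0;
        for (long i = 0; i < n; i++) {           /* xvfmaxa.d reduction */
            double a = fabs(x[i * incx]);
            if (a > maxa) maxa = a;
        }
        if (maxa == 0.0) return 0.0;
        double alpha = 1.0 / maxa, sum = 0.0;
        for (long i = 0; i < n; i++) {
            double t = x[i * incx] * alpha;      /* xvfmul.d VM, VX, VALPHA */
            sum += t * t;                        /* xvfmadd.d res, VM, VM, res */
        }
        return maxa * sqrt(sum);                 /* fsqrt.d; fmul.d by max */
    }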

View File

@ -1,3 +1,35 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
@ -12,6 +44,8 @@
#define t2 $r13
#define t3 $r14
#define t4 $r15
/* Don't change the following FRs unless you know the effects. */
#define VX0 $vr15
#define VX1 $vr16
#define VM0 $vr17
@ -35,6 +69,7 @@
vxor.v res1, res1, res1
vxor.v res2, res2, res2
vxor.v VM0, VM0, VM0
bge $r0, N, .L999
beq $r0, INCX, .L999
move XX, X
@ -46,7 +81,7 @@
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
vld VM0, X, 0
bge $r0, I, .L97
.align 3
@ -66,15 +101,7 @@
.align 3
.L20: // INCX!=1
move TEMP, X // initialize the maxa value
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L97
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
.align 3
.L21:
@ -154,16 +181,16 @@
.L110:
vld VX0, XX, 0 * SIZE
vld VX1, XX, 2 * SIZE
vfmul.d VM0, VX0, VALPHA
vfmul.d VM1, VX1, VALPHA
vfmadd.d res1, VM0, VM0, res1
vfmadd.d res2, VM1, VM1, res2
vfmul.d VM2, VX0, VALPHA
vfmul.d VM3, VX1, VALPHA
vfmadd.d res1, VM2, VM2, res1
vfmadd.d res2, VM3, VM3, res2
vld VX0, XX, 4 * SIZE
vld VX1, XX, 6 * SIZE
vfmul.d VM0, VX0, VALPHA
vfmul.d VM1, VX1, VALPHA
vfmadd.d res1, VM0, VM0, res1
vfmadd.d res2, VM1, VM1, res2
vfmul.d VM2, VX0, VALPHA
vfmul.d VM3, VX1, VALPHA
vfmadd.d res1, VM2, VM2, res1
vfmadd.d res2, VM3, VM3, res2
addi.d XX, XX, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L110
@ -173,6 +200,7 @@
.L120:
srai.d I, N, 3
bge $r0, I, .L997
.align 3
.L121:
ld.d t1, XX, 0 * SIZE
@ -187,14 +215,14 @@
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmul.d VM0, VX0, VALPHA
vfmul.d VM2, VX0, VALPHA
ld.d t1, XX, 0 * SIZE
add.d XX, XX, INCX
vfmul.d VM1, VX1, VALPHA
vfmul.d VM3, VX1, VALPHA
ld.d t2, XX, 0 * SIZE
add.d XX, XX, INCX
vfmadd.d res1, VM0, VM0, res1
vfmadd.d res2, VM1, VM1, res2
vfmadd.d res1, VM2, VM2, res1
vfmadd.d res2, VM3, VM3, res2
ld.d t3, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
@ -203,10 +231,10 @@
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmul.d VM0, VX0, VALPHA
vfmul.d VM1, VX1, VALPHA
vfmadd.d res1, VM0, VM0, res1
vfmadd.d res2, VM1, VM1, res2
vfmul.d VM2, VX0, VALPHA
vfmul.d VM3, VX1, VALPHA
vfmadd.d res1, VM2, VM2, res1
vfmadd.d res2, VM3, VM3, res2
addi.d I, I, -1
blt $r0, I, .L121
b .L996
@ -230,13 +258,11 @@
fmadd.d $f19, $f15, $f15, $f19
add.d XX, XX , INCX
blt $r0, I, .L998
fsqrt.d $f19, $f19
fmul.d $f0, max, $f19
jirl $r0, $r1, 0x0
.align 3
.L999:
fmov.d $f0, $f19
fsqrt.d $f19, $f19
fmul.d $f0, max, $f19
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -1,125 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define res1 $xr16
#define res2 $xr17
PROLOGUE
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvfadd.d res2, VX0, VX1
xvfadd.d res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.d $f12, X, 0 * SIZE
addi.d I, I, -1
fadd.d $f16, $f12, $f16
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
bge $r0, I, .L23
.align 3
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfadd.d res2, VX0, VX1
xvfadd.d res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.d $f12, X, 0 * SIZE
fadd.d $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.d $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,123 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define res1 $vr16
#define res2 $vr17
PROLOGUE
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.d $f12, X, 0 * SIZE
fadd.d $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
bge $r0, I, .L23
.align 3
.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
add.d X, X, INCX
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t3, 0
vinsgr2vr.d VX0, t4, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.d $f12, X, 0 * SIZE
fadd.d $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.d $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,301 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
addi.d I, I, -1
xvst VX2, X, 0 * SIZE
xvst VX3, X, 4 * SIZE
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
.align 3
.L121:
xvld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
xvstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
xvstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
xvstelm.d VX0, Y, 0, 2
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvstelm.d VX0, Y, 0, 3
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvst VX2, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
ld.d t1, Y, 0 * SIZE
xvstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
xvstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
xvstelm.d VX1, Y, 0, 2
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
xvstelm.d VX1, Y, 0, 3
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvst VX3, X, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
xvld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
xvstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
xvstelm.d VX2, X, 0, 1
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
xvstelm.d VX2, X, 0, 2
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvstelm.d VX2, X, 0, 3
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvst VX0, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
ld.d t1, X, 0 * SIZE
xvstelm.d VX3, X, 0, 0
add.d X, X, INCY
ld.d t2, X, 0 * SIZE
xvstelm.d VX3, X, 0, 1
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
xvstelm.d VX3, X, 0, 2
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
xvstelm.d VX3, X, 0, 3
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
add.d X, X, INCX
xvst VX1, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bgez INCX, .L220
//addi.d TEMP, N, -1
//mul.d TEMP, TEMP, INCX
//sub.d X, X, TEMP
.align 3
.L220:
bge $r0, I, .L223
.align 3
move XX, X
.L222:
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fld.d b1, Y, 0 * SIZE
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b2, Y, 0 * SIZE
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b3, Y, 0 * SIZE
fst.d a3, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b4, Y, 0 * SIZE
fst.d a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fst.d b1, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b1, Y, 0 * SIZE
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fst.d b2, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b2, Y, 0 * SIZE
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fst.d b3, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b3, Y, 0 * SIZE
fst.d a3, Y, 0 * SIZE
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d b4, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b4, Y, 0 * SIZE
fst.d a4, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d b1, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b2, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b3, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b4, XX, 0 * SIZE
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,317 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vld VX2, Y, 0 * SIZE
vld VX3, Y, 2 * SIZE
vst VX2, X, 0 * SIZE
vst VX3, X, 2 * SIZE
vst VX0, Y, 0 * SIZE
vst VX1, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vld VX2, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
addi.d I, I, -1
vst VX2, X, 4 * SIZE
vst VX3, X, 6 * SIZE
vst VX0, Y, 4 * SIZE
vst VX1, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
.align 3
.L121:
vld VX0, X, 0 * SIZE
ld.d t1, Y, 0 * SIZE
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vstelm.d VX0, Y, 0, 1
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vst VX2, X, 0 * SIZE
vld VX1, X, 2 * SIZE
ld.d t3, Y, 0 * SIZE
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vstelm.d VX1, Y, 0, 1
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vst VX3, X, 2 * SIZE
vld VX0, X, 4 * SIZE
ld.d t1, Y, 0 * SIZE
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0 * SIZE
vstelm.d VX0, Y, 0, 1
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vst VX2, X, 4 * SIZE
vld VX1, X, 6 * SIZE
ld.d t3, Y, 0 * SIZE
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t4, Y, 0 * SIZE
vstelm.d VX1, Y, 0, 1
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vst VX3, X, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
vld VX2, Y, 0 * SIZE
ld.d t1, X, 0 * SIZE
vstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vstelm.d VX2, X, 0, 1
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCY
vst VX0, Y, 0 * SIZE
vld VX3, Y, 2 * SIZE
ld.d t3, X, 0 * SIZE
vstelm.d VX3, X, 0, 0
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vstelm.d VX3, X, 0, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vst VX1, Y, 2 * SIZE
vld VX2, Y, 4 * SIZE
ld.d t1, X, 0 * SIZE
vstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
vstelm.d VX2, X, 0, 1
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCY
vst VX0, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
ld.d t3, X, 0 * SIZE
vstelm.d VX3, X, 0, 0
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
vstelm.d VX3, X, 0, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vst VX1, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bgez INCX, .L220
//addi.d TEMP, N, -1
//mul.d TEMP, TEMP, INCX
//sub.d X, X, TEMP
.align 3
.L220:
bge $r0, I, .L223
.align 3
move XX, X
.L222:
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fld.d b1, Y, 0 * SIZE
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b2, Y, 0 * SIZE
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b3, Y, 0 * SIZE
fst.d a3, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d b4, Y, 0 * SIZE
fst.d a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a1, X, 0 * SIZE
add.d X, X, INCX
fst.d b1, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b1, Y, 0 * SIZE
fst.d a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a2, X, 0 * SIZE
add.d X, X, INCX
fst.d b2, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b2, Y, 0 * SIZE
fst.d a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.d a3, X, 0 * SIZE
add.d X, X, INCX
fst.d b3, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b3, Y, 0 * SIZE
fst.d a3, Y, 0 * SIZE
fld.d a4, X, 0 * SIZE
add.d X, X, INCX
fst.d b4, XX, 0 * SIZE
add.d XX, XX, INCX
fld.d b4, Y, 0 * SIZE
fst.d a4, Y, 0 * SIZE
add.d Y, Y, INCY
fst.d b1, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b2, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b3, XX, 0 * SIZE
add.d XX, XX, INCX
fst.d b4, XX, 0 * SIZE
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.d $f12, X, 0 * SIZE
fld.d $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.d $f12, Y, 0 * SIZE
fst.d $f14, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,216 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $xr12
#define VX1 $xr13
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvst VX0, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.s $f12, Y, 0 * SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12:
bge $r0, I, .L122
.align 3
.L121:
xvld VX0, X, 0 * SIZE
xvstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 3
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 4
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 5
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 6
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0, 7
add.d Y, Y, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.s $f12, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvst VX0, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
.align 3
.L222:
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a3, X, 0 * SIZE
add.d Y, Y, INCY
fst.s a4, X, 0 * SIZE
add.d Y, Y, INCY
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a3, X, 0 * SIZE
add.d Y, Y, INCY
fst.s a4, X, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,220 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $vr12
#define VX1 $vr13
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
addi.d I, I, -1
vst VX0, Y, 0 * SIZE
vst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.s $f12, Y, 0 * SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12:
bge $r0, I, .L122
.align 3
.L121:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
vstelm.w VX0, Y, 0, 3
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 0
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 1
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 2
add.d Y, Y, INCY
vstelm.w VX1, Y, 0, 3
add.d Y, Y, INCY
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
fst.s $f12, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vst VX0, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
.align 3
.L222:
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a3, X, 0 * SIZE
add.d Y, Y, INCY
fst.s a4, X, 0 * SIZE
add.d Y, Y, INCY
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s a3, X, 0 * SIZE
add.d Y, Y, INCY
fst.s a4, X, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,3 +1,35 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
@ -11,10 +43,13 @@
#define t2 $r13
#define t3 $r14
#define t4 $r15
/* Don't change the following FRs unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define VX4 $xr21
#define res1 $xr19
#define res2 $xr20
@ -37,14 +72,13 @@
.align 3
.L10:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 0 * SIZE
xvfcvtl.d.s VX0, VX0
xvfcvth.d.s VX1, VX1
xvfmadd.d res1, VX0, VX0, res1
xvfmadd.d res2, VX1, VX1, res2
xvld VX0, X, 0
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
.align 3
b .L996
@ -54,70 +88,46 @@
.align 3
.L21:
ld.w t1, X, 0 * SIZE
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX1, t1, 0
xvinsgr2vr.w VX1, t2, 1
xvinsgr2vr.w VX1, t3, 2
xvinsgr2vr.w VX1, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX1, t1, 4
xvinsgr2vr.w VX1, t2, 5
xvinsgr2vr.w VX1, t3, 6
xvinsgr2vr.w VX1, t4, 7
xvfcvtl.d.s VX0, VX0
xvfcvth.d.s VX1, VX1
xvfmadd.d res1, VX0, VX0, res1
xvfmadd.d res2, VX1, VX1, res2
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
.L996:
xvfadd.d res1, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
fadd.d $f19, $f19, $f16
fadd.d $f19, $f19, $f17
fadd.d $f19, $f19, $f18
.align 3
.L997:
@ -126,11 +136,11 @@
.align 3
.L998:
fld.s $f15, X, 0 * SIZE
addi.d I, I, -1
fld.s $f15, X, 0
add.d X, X, INCX
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
add.d X, X, INCX
fmadd.d $f19, $f15, $f15, $f19
blt $r0, I, .L998
.align 3
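
In the snrm2 kernel above, each batch of floats is widened to double with xvfcvtl.d.s/xvfcvth.d.s (the LSX file below uses the vfcvt* forms) before the fused multiply-add, so the sum of squares accumulates in double precision, and the reduction at .L996 now combines the lanes with double-precision adds. A scalar sketch of that accumulation (the final square root falls outside the lines shown):

    /* Single-precision nrm2 accumulation (sketch): widen each float to
       double before squaring, as xvfcvtl.d.s / xvfcvth.d.s do above. */
    static double snrm2_sumsq(long n, const float *x, long incx) {
        double sum = 0.0;
        for (long i = 0; i < n; i++) {
            double t = (double)x[i * incx];      /* fcvt.d.s in the tail loop */
            sum += t * t;                        /* fmadd.d */
        }
        return sum;   /* caller takes sqrt of this to obtain the norm */
    }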

View File

@ -1,3 +1,35 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
@ -15,6 +47,9 @@
#define VX1 $vr16
#define VX2 $vr17
#define VX3 $vr18
#define VX4 $vr21
#define VX5 $vr22
/* Don't change the following FRs unless you know the effects. */
#define res1 $vr19
#define res2 $vr20
@ -24,99 +59,71 @@
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, N, .L999
beq $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L997
bge $r0, I, .L997
.align 3
.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 0 * SIZE
vfcvtl.d.s VX0, VX0
vfcvth.d.s VX1, VX1
vfmadd.d res1, VX0, VX0, res1
vfmadd.d res2, VX1, VX1, res2
vld VX2, X, 4 * SIZE
vld VX3, X, 4 * SIZE
vfcvtl.d.s VX2, VX2
vfcvth.d.s VX3, VX3
vfmadd.d res1, VX2, VX2, res1
vfmadd.d res2, VX3, VX3, res2
vld VX0, X, 0
vld VX5, X, 4 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d X, X, 8 * SIZE
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfcvtl.d.s VX3, VX5
vfcvth.d.s VX4, VX5
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
blt $r0, I, .L10
b .L996
.align 3
.L20:
bge $r0, I, .L997
.align 3
.L21:
ld.w t1, X, 0 * SIZE
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
vinsgr2vr.w VX0, t4, 3
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfcvtl.d.s VX0, VX0
vfcvth.d.s VX1, VX1
vfmadd.d res1, VX0, VX0, res1
vfmadd.d res2, VX1, VX1, res2
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
vfcvtl.d.s VX2, VX2
vfcvth.d.s VX3, VX3
vfmadd.d res1, VX2, VX2, res1
vfmadd.d res2, VX3, VX3, res2
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
@ -124,12 +131,8 @@
.L996:
vfadd.d res1, res1, res2
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3
.L997:
@ -138,7 +141,7 @@
.align 3
.L998:
fld.s $f15, X, 0 * SIZE
fld.s $f15, X, 0
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19

View File

@ -1,140 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define res1 $xr16
#define res2 $xr17
PROLOGUE
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
xvld VX0, X, 0 * SIZE
xvfadd.s res1, VX0, res1
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX2, res1
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.s $f12, X, 0 * SIZE
fadd.s $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
bge $r0, I, .L23
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfadd.s res1, VX0, res1
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX2, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.s $f12, X, 0 * SIZE
fadd.s $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,125 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define res1 $vr16
#define res2 $vr17
PROLOGUE
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfadd.s res2, VX0, VX1
vfadd.s res1, res1, res2
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
fld.s $f12, X, 0 * SIZE
fadd.s $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
bge $r0, I, .L23
.align 3
.L21:
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfadd.s res2, VX0, VX1
vfadd.s res1, res1, res2
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
fld.s $f12, X, 0 * SIZE
fadd.s $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,286 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0 * SIZE
xvld VX2, Y, 0 * SIZE
addi.d I, I, -1
xvst VX2, X, 0 * SIZE
xvst VX0, Y, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
.align 3
.L121:
xvld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 3
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
ld.w t1, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 4
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 5
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 6
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
xvstelm.w VX0, Y, 0, 7
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvst VX2, X, 0 * SIZE
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
xvld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
xvstelm.w VX2, X, 0, 0
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
xvstelm.w VX2, X, 0, 1
add.d X, X, INCY
ld.w t3, X, 0 * SIZE
xvstelm.w VX2, X, 0, 2
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvstelm.w VX2, X, 0, 3
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
ld.w t1, X, 0 * SIZE
xvstelm.w VX2, X, 0, 4
add.d X, X, INCY
ld.w t2, X, 0 * SIZE
xvstelm.w VX2, X, 0, 5
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
xvstelm.w VX2, X, 0, 6
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
xvstelm.w VX2, X, 0, 7
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvst VX1, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
.align 3
move XX, X
.L222:
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fld.s b1, Y, 0 * SIZE
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b2, Y, 0 * SIZE
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b3, Y, 0 * SIZE
fst.s a3, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b4, Y, 0 * SIZE
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fst.s b1, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b1, Y, 0 * SIZE
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fst.s b2, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b2, Y, 0 * SIZE
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fst.s b3, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b3, Y, 0 * SIZE
fst.s a3, Y, 0 * SIZE
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s b4, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b4, Y, 0 * SIZE
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s b1, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b2, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b3, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b4, XX, 0 * SIZE
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -1,294 +0,0 @@
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vld VX2, Y, 0 * SIZE
vld VX3, Y, 4 * SIZE
addi.d I, I, -1
vst VX2, X, 0 * SIZE
vst VX3, X, 4 * SIZE
vst VX0, Y, 0 * SIZE
vst VX1, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
.align 3
.L121:
vld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
vstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
vstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
vstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vstelm.w VX0, Y, 0, 3
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vst VX2, X, 0 * SIZE
vld VX1, X, 4 * SIZE
ld.w t1, Y, 0 * SIZE
vstelm.w VX1, Y, 0, 0
add.d Y, Y, INCY
ld.w t2, Y, 0 * SIZE
vstelm.w VX1, Y, 0, 1
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
vstelm.w VX1, Y, 0, 2
add.d Y, Y, INCY
ld.w t4, Y, 0 * SIZE
vstelm.w VX1, Y, 0, 3
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vst VX3, X, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:// INCX!=1 and INCY==1
bge $r0, I, .L212
.align 3
.L211:
vld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
vstelm.w VX2, X, 0, 0
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
vstelm.w VX2, X, 0, 1
add.d X, X, INCY
ld.w t3, X, 0 * SIZE
vstelm.w VX2, X, 0, 2
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vstelm.w VX2, X, 0, 3
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vst VX0, Y, 0 * SIZE
vld VX3, Y, 4 * SIZE
ld.w t1, X, 0 * SIZE
vstelm.w VX3, X, 0, 0
add.d X, X, INCY
ld.w t2, X, 0 * SIZE
vstelm.w VX3, X, 0, 1
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
vstelm.w VX3, X, 0, 2
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
vstelm.w VX3, X, 0, 3
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
add.d X, X, INCX
vst VX1, Y, 0 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
.align 3
move XX, X
.L222:
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fld.s b1, Y, 0 * SIZE
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b2, Y, 0 * SIZE
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b3, Y, 0 * SIZE
fst.s a3, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s b4, Y, 0 * SIZE
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a1, X, 0 * SIZE
add.d X, X, INCX
fst.s b1, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b1, Y, 0 * SIZE
fst.s a1, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a2, X, 0 * SIZE
add.d X, X, INCX
fst.s b2, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b2, Y, 0 * SIZE
fst.s a2, Y, 0 * SIZE
add.d Y, Y, INCY
fld.s a3, X, 0 * SIZE
add.d X, X, INCX
fst.s b3, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b3, Y, 0 * SIZE
fst.s a3, Y, 0 * SIZE
fld.s a4, X, 0 * SIZE
add.d X, X, INCX
fst.s b4, XX, 0 * SIZE
add.d XX, XX, INCX
fld.s b4, Y, 0 * SIZE
fst.s a4, Y, 0 * SIZE
add.d Y, Y, INCY
fst.s b1, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b2, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b3, XX, 0 * SIZE
add.d XX, XX, INCX
fst.s b4, XX, 0 * SIZE
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
fld.s $f12, X, 0 * SIZE
fld.s $f14, Y, 0 * SIZE
addi.d I, I, -1
fst.s $f12, Y, 0 * SIZE
fst.s $f14, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,225 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define res1 $xr16
#define res2 $xr17
PROLOGUE
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
#ifdef DOUBLE
xvld VX0, X, 0
xvld VX1, X, 32
xvfadd.d res1, res1, VX0
xvfadd.d res1, res1, VX1
#else
xvld VX0, X, 0
xvfadd.s res1, res1, VX0
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
#ifdef DOUBLE
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
#else
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
#endif
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
LD $f12, X, 0
ADD $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
bge $r0, I, .L23
.align 3
.L21:
#ifdef DOUBLE
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfadd.d res2, VX0, VX1
xvfadd.d res1, res1, res2
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfadd.s res1, VX0, res1
#endif
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
#ifdef DOUBLE
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
#else
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
#endif
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
LD $f12, X, 0
ADD $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE
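This sum kernel and the LSX one that follows replace the separate ssum/dsum sources: #ifdef DOUBLE together with the LD/ADD scalar macros (and VFADD in the LSX file) selects the element type, the unit-stride path consumes eight elements per iteration, the strided path gathers them with ld.w/ld.d plus xvinsgr2vr/vinsgr2vr, and a scalar loop finishes the tail. A minimal plain-C sketch of the result, not part of the commit (FLOAT stands in for the precision switch; the function name and unsigned stride are illustrative):

#include <stddef.h>

#ifdef DOUBLE
typedef double FLOAT;                /* dsum build */
#else
typedef float FLOAT;                 /* ssum build */
#endif

/* Illustrative only: the value the vector bodies and the scalar tails add up. */
static FLOAT sum_ref(size_t n, const FLOAT *x, size_t incx)
{
    FLOAT acc = 0;                   /* res1 starts as an all-zero register */
    for (size_t i = 0; i < n; i++)   /* .L11/.L21 bodies, .L14/.L24 tails   */
        acc += x[i * incx];          /* plain sum: no fabs(), unlike asum   */
    return acc;
}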

View File

@ -0,0 +1,204 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define res1 $vr16
#define res2 $vr17
PROLOGUE
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L13
.align 3
.L11:
vld VX0, X, 0
vld VX1, X, 16
VFADD res2, VX0, VX1
VFADD res1, res1, res2
#ifdef DOUBLE
vld VX0, X, 32
vld VX1, X, 48
VFADD res2, VX0, VX1
VFADD res1, res1, res2
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L11
.align 3
.L12:
#ifdef DOUBLE
vreplvei.d VX1, res1, 1
vfadd.d res1, res1, VX1
#else
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
#endif
.align 3
.L13:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L14:
LD $f12, X, 0
ADD $f16, $f12, $f16
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L14
b .L999
.align 3
.L20:
bge $r0, I, .L23
.align 3
.L21:
#ifdef DOUBLE
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
vinsgr2vr.d VX0, t3, 0
vinsgr2vr.d VX0, t4, 1
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfadd.d res2, VX0, VX1
vfadd.d res1, res1, res2
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfadd.s res2, VX0, VX1
vfadd.s res1, res1, res2
#endif
addi.d I, I, -1
blt $r0, I, .L21
.align 3
.L22:
#ifdef DOUBLE
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
#else
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
#endif
.align 3
.L23:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L24:
LD $f12, X, 0
ADD $f16, $f12, $f16
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L24
.align 3
.L999:
fmov.s $f0, $f16
jirl $r0, $r1, 0x0
.align 3
EPILOGUE

View File

@ -0,0 +1,401 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
/* INCX==1 and INCY==1 */
.L11:
bge $r0, I, .L112
.align 3
.L111:
xvld VX0, X, 0
xvld VX2, Y, 0
addi.d I, I, -1
xvst VX2, X, 0
xvst VX0, Y, 0
#ifdef DOUBLE
xvld VX0, X, 32
xvld VX2, Y, 32
xvst VX2, X, 32
xvst VX0, Y, 32
#endif
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
LD $f12, X, 0
LD $f14, Y, 0
addi.d I, I, -1
ST $f12, Y, 0
ST $f14, X, 0
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
/* INCX==1 and INCY!=1 */
.L12:
bge $r0, I, .L122
.align 3
.L121:
#ifdef DOUBLE
xvld VX0, X, 0
ld.d t1, Y, 0
xvstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0
xvstelm.d VX0, Y, 0, 1
add.d Y, Y, INCY
ld.d t3, Y, 0
xvstelm.d VX0, Y, 0, 2
add.d Y, Y, INCY
ld.d t4, Y, 0
xvstelm.d VX0, Y, 0, 3
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
add.d Y, Y, INCY
xvst VX2, X, 0
xvld VX1, X, 4 * SIZE
ld.d t1, Y, 0
xvstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0
xvstelm.d VX1, Y, 0, 1
add.d Y, Y, INCY
ld.d t3, Y, 0
xvstelm.d VX1, Y, 0, 2
add.d Y, Y, INCY
ld.d t4, Y, 0
xvstelm.d VX1, Y, 0, 3
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
add.d Y, Y, INCY
xvst VX3, X, 4 * SIZE
addi.d X, X, 8 * SIZE
#else
xvld VX0, X, 0
ld.w t1, Y, 0
xvstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
ld.w t2, Y, 0
xvstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
ld.w t3, Y, 0
xvstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
ld.w t4, Y, 0
xvstelm.w VX0, Y, 0, 3
xvinsgr2vr.w VX2, t1, 0
xvinsgr2vr.w VX2, t2, 1
xvinsgr2vr.w VX2, t3, 2
xvinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
ld.w t1, Y, 0
xvstelm.w VX0, Y, 0, 4
add.d Y, Y, INCY
ld.w t2, Y, 0
xvstelm.w VX0, Y, 0, 5
add.d Y, Y, INCY
ld.w t3, Y, 0
xvstelm.w VX0, Y, 0, 6
add.d Y, Y, INCY
ld.w t4, Y, 0
xvstelm.w VX0, Y, 0, 7
xvinsgr2vr.w VX2, t1, 4
xvinsgr2vr.w VX2, t2, 5
xvinsgr2vr.w VX2, t3, 6
xvinsgr2vr.w VX2, t4, 7
add.d Y, Y, INCY
xvst VX2, X, 0
addi.d X, X, 8 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
LD $f12, X, 0
LD $f14, Y, 0
addi.d I, I, -1
ST $f12, Y, 0
ST $f14, X, 0
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
.L21:
bge $r0, I, .L212
.align 3
.L211:
#ifdef DOUBLE
xvld VX2, Y, 0
ld.d t1, X, 0
xvstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0
xvstelm.d VX2, X, 0, 1
add.d X, X, INCX
ld.d t3, X, 0
xvstelm.d VX2, X, 0, 2
add.d X, X, INCX
ld.d t4, X, 0
xvstelm.d VX2, X, 0, 3
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
add.d X, X, INCX
xvst VX0, Y, 0
xvld VX3, Y, 4 * SIZE
ld.d t1, X, 0
xvstelm.d VX3, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0
xvstelm.d VX3, X, 0, 1
add.d X, X, INCX
ld.d t3, X, 0
xvstelm.d VX3, X, 0, 2
add.d X, X, INCX
ld.d t4, X, 0
xvstelm.d VX3, X, 0, 3
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
add.d X, X, INCX
xvst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
#else
xvld VX2, Y, 0
ld.w t1, X, 0
xvstelm.w VX2, X, 0, 0
add.d X, X, INCX
ld.w t2, X, 0
xvstelm.w VX2, X, 0, 1
add.d X, X, INCX
ld.w t3, X, 0
xvstelm.w VX2, X, 0, 2
add.d X, X, INCX
ld.w t4, X, 0
xvstelm.w VX2, X, 0, 3
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
ld.w t1, X, 0
xvstelm.w VX2, X, 0, 4
add.d X, X, INCX
ld.w t2, X, 0
xvstelm.w VX2, X, 0, 5
add.d X, X, INCX
ld.w t3, X, 0
xvstelm.w VX2, X, 0, 6
add.d X, X, INCX
ld.w t4, X, 0
xvstelm.w VX2, X, 0, 7
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvst VX0, Y, 0
addi.d Y, Y, 8 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
LD $f12, X, 0
LD $f14, Y, 0
addi.d I, I, -1
ST $f12, Y, 0
ST $f14, X, 0
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
.align 3
move XX, X
.L222:
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD b3, Y, 0
ST a3, Y, 0
add.d Y, Y, INCY
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
ST b1, XX, 0
add.d XX, XX, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD a2, X, 0
add.d X, X, INCX
ST b2, XX, 0
add.d XX, XX, INCX
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD a3, X, 0
add.d X, X, INCX
ST b3, XX, 0
add.d XX, XX, INCX
LD b3, Y, 0
ST a3, Y, 0
LD a4, X, 0
add.d X, X, INCX
ST b4, XX, 0
add.d XX, XX, INCX
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
ST b1, XX, 0
add.d XX, XX, INCX
ST b2, XX, 0
add.d XX, XX, INCX
ST b3, XX, 0
add.d XX, XX, INCX
ST b4, XX, 0
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
LD $f12, X, 0
LD $f14, Y, 0
addi.d I, I, -1
ST $f12, Y, 0
ST $f14, X, 0
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE
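The swap kernel above (its LSX twin follows) dispatches on the strides: .L11 swaps with full vector loads and stores when both increments are 1, .L12/.L21 pair a vector access on the unit-stride operand with element-wise xvstelm stores and ld gathers on the strided one, and .L22 exchanges scalars, with short scalar loops handling the tails. All paths have the same effect, sketched below in plain C; the sketch is not part of the commit, and the function name and unsigned strides are illustrative.

#include <stddef.h>

/* Illustrative only: the element-wise exchange every assembly path performs.
   The DOUBLE build does the same with 64-bit lanes (fld.d/fst.d, xvstelm.d). */
static void swap_ref(size_t n, float *x, size_t incx, float *y, size_t incy)
{
    for (size_t i = 0; i < n; i++) {
        float t = x[i * incx];       /* LD a1, X */
        x[i * incx] = y[i * incy];   /* ST $f14, X */
        y[i * incy] = t;             /* ST $f12, Y */
    }
}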

View File

@ -0,0 +1,431 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
/* INCX==1 and INCY==1 */
.L11:
bge $r0, I, .L112
.align 3
.L111:
vld VX0, X, 0
vld VX1, X, 16
vld VX2, Y, 0
vld VX3, Y, 16
addi.d I, I, -1
vst VX2, X, 0
vst VX3, X, 16
vst VX0, Y, 0
vst VX1, Y, 16
#ifdef DOUBLE
vld VX0, X, 32
vld VX1, X, 48
vld VX2, Y, 32
vld VX3, Y, 48
vst VX2, X, 32
vst VX3, X, 48
vst VX0, Y, 32
vst VX1, Y, 48
#endif
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L111
.align 3
.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L113:
#ifdef DOUBLE
fld.d $f12, X, 0
fld.d $f14, Y, 0
addi.d I, I, -1
fst.d $f12, Y, 0
fst.d $f14, X, 0
#else
fld.s $f12, X, 0
fld.s $f14, Y, 0
addi.d I, I, -1
fst.s $f12, Y, 0
fst.s $f14, X, 0
#endif
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L113
b .L999
.align 3
/* INCX==1 and INCY!=1 */
.L12:
bge $r0, I, .L122
.align 3
.L121:
#ifdef DOUBLE
vld VX0, X, 0
ld.d t1, Y, 0
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0
vstelm.d VX0, Y, 0, 1
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vst VX2, X, 0
vld VX1, X, 2 * SIZE
ld.d t3, Y, 0
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t4, Y, 0
vstelm.d VX1, Y, 0, 1
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vst VX3, X, 2 * SIZE
vld VX0, X, 4 * SIZE
ld.d t1, Y, 0
vstelm.d VX0, Y, 0, 0
add.d Y, Y, INCY
ld.d t2, Y, 0
vstelm.d VX0, Y, 0, 1
vinsgr2vr.d VX2, t1, 0
vinsgr2vr.d VX2, t2, 1
add.d Y, Y, INCY
vst VX2, X, 4 * SIZE
vld VX1, X, 6 * SIZE
ld.d t3, Y, 0
vstelm.d VX1, Y, 0, 0
add.d Y, Y, INCY
ld.d t4, Y, 0
vstelm.d VX1, Y, 0, 1
vinsgr2vr.d VX3, t3, 0
vinsgr2vr.d VX3, t4, 1
add.d Y, Y, INCY
vst VX3, X, 6 * SIZE
addi.d X, X, 8 * SIZE
#else
vld VX0, X, 0
ld.w t1, Y, 0
vstelm.w VX0, Y, 0, 0
add.d Y, Y, INCY
ld.w t2, Y, 0
vstelm.w VX0, Y, 0, 1
add.d Y, Y, INCY
ld.w t3, Y, 0
vstelm.w VX0, Y, 0, 2
add.d Y, Y, INCY
ld.w t4, Y, 0
vstelm.w VX0, Y, 0, 3
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
add.d Y, Y, INCY
vst VX2, X, 0
vld VX1, X, 4 * SIZE
ld.w t1, Y, 0
vstelm.w VX1, Y, 0, 0
add.d Y, Y, INCY
ld.w t2, Y, 0
vstelm.w VX1, Y, 0, 1
add.d Y, Y, INCY
ld.w t3, Y, 0
vstelm.w VX1, Y, 0, 2
add.d Y, Y, INCY
ld.w t4, Y, 0
vstelm.w VX1, Y, 0, 3
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
add.d Y, Y, INCY
vst VX3, X, 4 * SIZE
addi.d X, X, 8 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L121
.align 3
.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L123:
LD $f12, X, 0
LD $f14, Y, 0
addi.d I, I, -1
ST $f12, Y, 0
ST $f14, X, 0
addi.d X, X, SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3
/* INCX!=1 and INCY==1 */
.L21:
bge $r0, I, .L212
.align 3
.L211:
#ifdef DOUBLE
vld VX2, Y, 0
ld.d t1, X, 0
vstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0
vstelm.d VX2, X, 0, 1
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX
vst VX0, Y, 0
vld VX3, Y, 2 * SIZE
ld.d t3, X, 0
vstelm.d VX3, X, 0, 0
add.d X, X, INCX
ld.d t4, X, 0
vstelm.d VX3, X, 0, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vst VX1, Y, 2 * SIZE
vld VX2, Y, 4 * SIZE
ld.d t1, X, 0
vstelm.d VX2, X, 0, 0
add.d X, X, INCX
ld.d t2, X, 0
vstelm.d VX2, X, 0, 1
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
add.d X, X, INCX
vst VX0, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
ld.d t3, X, 0
vstelm.d VX3, X, 0, 0
add.d X, X, INCX
ld.d t4, X, 0
vstelm.d VX3, X, 0, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
add.d X, X, INCX
vst VX1, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
#else
vld VX2, Y, 0
ld.w t1, X, 0
vstelm.w VX2, X, 0, 0
add.d X, X, INCX
ld.w t2, X, 0
vstelm.w VX2, X, 0, 1
add.d X, X, INCX
ld.w t3, X, 0
vstelm.w VX2, X, 0, 2
add.d X, X, INCX
ld.w t4, X, 0
vstelm.w VX2, X, 0, 3
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vst VX0, Y, 0
vld VX3, Y, 4 * SIZE
ld.w t1, X, 0
vstelm.w VX3, X, 0, 0
add.d X, X, INCX
ld.w t2, X, 0
vstelm.w VX3, X, 0, 1
add.d X, X, INCX
ld.w t3, X, 0
vstelm.w VX3, X, 0, 2
add.d X, X, INCX
ld.w t4, X, 0
vstelm.w VX3, X, 0, 3
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
add.d X, X, INCX
vst VX1, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L211
.align 3
.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L213:
LD $f12, X, 0 * SIZE
LD $f14, Y, 0 * SIZE
addi.d I, I, -1
ST $f12, Y, 0 * SIZE
ST $f14, X, 0 * SIZE
add.d X, X, INCX
addi.d Y, Y, SIZE
blt $r0, I, .L213
b .L999
.align 3
.L22:
bge $r0, I, .L223
.align 3
move XX, X
.L222:
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD b3, Y, 0
ST a3, Y, 0
add.d Y, Y, INCY
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
ST b1, XX, 0
add.d XX, XX, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD a2, X, 0
add.d X, X, INCX
ST b2, XX, 0
add.d XX, XX, INCX
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD a3, X, 0
add.d X, X, INCX
ST b3, XX, 0
add.d XX, XX, INCX
LD b3, Y, 0
ST a3, Y, 0
LD a4, X, 0
add.d X, X, INCX
ST b4, XX, 0
add.d XX, XX, INCX
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
ST b1, XX, 0
add.d XX, XX, INCX
ST b2, XX, 0
add.d XX, XX, INCX
ST b3, XX, 0
add.d XX, XX, INCX
ST b4, XX, 0
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L222
.align 3
.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L224:
LD $f12, X, 0
LD $f14, Y, 0
addi.d I, I, -1
ST $f12, Y, 0
ST $f14, X, 0
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE