Merge pull request #4340 from yinshiyou/la-dev

Add some refines and optimizations for LoongArch.
This commit is contained in:
Martin Kroeker 2023-11-29 08:22:25 +01:00 committed by GitHub
commit 39bf8ece20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 785 additions and 25 deletions

View File

@ -127,7 +127,7 @@ int main(int argc, char *argv[]){
long long muls = n*(n+1)/2.0;
long long adds = (n - 1.0)*n/2.0;
fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
if(a != NULL){
free(a);
}

View File

@ -199,8 +199,7 @@ if [ "$architecture" = "loongarch64" ]; then
tmpd="$(mktemp -d)"
tmplsx="$tmpd/lsx.c"
codelsx='"vadd.b $vr0, $vr0, $vr0"'
lsx_flags='-march=loongarch64 -mlsx'
printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
lsx_flags='-march=loongarch64'
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
args="$lsx_flags -o $tmplsx.o $tmplsx"
{
@ -211,8 +210,7 @@ if [ "$architecture" = "loongarch64" ]; then
tmplasx="$tmpd/lasx.c"
codelasx='"xvadd.b $xr0, $xr0, $xr0"'
lasx_flags='-march=loongarch64 -mlasx'
printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
lasx_flags='-march=loongarch64'
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
args="$lasx_flags -o $tmplasx.o $tmplasx"
{

View File

@ -241,8 +241,7 @@ if (($architecture eq "loongarch64")) {
} else {
$tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$codelsx = '"vadd.b $vr0, $vr0, $vr0"';
$lsx_flags = "-march=loongarch64 -mlsx";
print $tmplsx "#include <lsxintrin.h>\n\n";
$lsx_flags = "-march=loongarch64";
print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
$args = "$lsx_flags -o $tmplsx.o $tmplsx";
@ -257,8 +256,7 @@ if (($architecture eq "loongarch64")) {
$tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
$lasx_flags = "-march=loongarch64 -mlasx";
print $tmplasx "#include <lasxintrin.h>\n\n";
$lasx_flags = "-march=loongarch64";
print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
$args = "$lasx_flags -o $tmplasx.o $tmplasx";

View File

@ -124,7 +124,17 @@ static inline int WhereAmI(void){
#define CMPLE fcmp.cle.d
#define CMPLT fcmp.clt.d
#define NEG fneg.d
#define XVFSUB xvfsub.d
#define XVFADD xvfadd.d
#define XVFMADD xvfmadd.d
#define VFSUB vfsub.d
#define VFADD vfadd.d
#define VFMADD vfmadd.d
#else
#define LD fld.s
#define ST fst.s
#define MADD fmadd.s
@ -142,6 +152,15 @@ static inline int WhereAmI(void){
#define CMPLE fcmp.cle.s
#define CMPLT fcmp.clt.s
#define NEG fneg.s
#define XVFSUB xvfsub.s
#define XVFADD xvfadd.s
#define XVFMADD xvfmadd.s
#define VFSUB vfsub.s
#define VFADD vfadd.s
#define VFMADD vfmadd.s
#endif /* defined(DOUBLE) */
#if defined(__64BIT__) && defined(USE64BITINT)

View File

@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_LOONGSON3R5 1
#define CPU_LOONGSON2K1000 2
#define LA_HWCAP_LSX (1<<4)
#define LA_HWCAP_LASX (1<<5)
#define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)
static char *cpuname[] = {
"LOONGSONGENERIC",
@ -64,11 +64,11 @@ static char *cpuname_lower[] = {
int detect(void) {
#ifdef __linux
int flag = (int)getauxval(AT_HWCAP);
int hwcap = (int)getauxval(AT_HWCAP);
if (flag & LA_HWCAP_LASX)
if (hwcap & LA_HWCAP_LASX)
return CPU_LOONGSON3R5;
else if (flag & LA_HWCAP_LSX)
else if (hwcap & LA_HWCAP_LSX)
return CPU_LOONGSON2K1000;
else
return CPU_GENERIC;
@ -94,7 +94,9 @@ void get_subdirname(void) {
}
void get_cpuconfig(void) {
uint32_t hwcaps = 0;
int d = detect();
switch (d) {
case CPU_LOONGSON3R5:
printf("#define LOONGSON3R5\n");
@ -129,6 +131,10 @@ void get_cpuconfig(void) {
printf("#define L2_ASSOCIATIVE 16\n");
break;
}
hwcaps = (uint32_t)getauxval( AT_HWCAP );
if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n");
if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n");
}
void get_libname(void){

View File

@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <sys/auxv.h>
#include "common.h"
extern gotoblas_t gotoblas_LOONGSON3R5;
@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) {
return NULL;
}
#define LASX_MASK 1<<7
#define LSX_MASK 1<<6
#define LOONGARCH_CFG2 0x02
#define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)
static gotoblas_t *get_coretype(void) {
int ret = 0;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(ret)
: "r"(LOONGARCH_CFG2)
);
int hwcap = (int)getauxval(AT_HWCAP);
if (ret & LASX_MASK)
if (hwcap & LA_HWCAP_LASX)
return &gotoblas_LOONGSON3R5;
else if (ret & LSX_MASK)
else if (hwcap & LA_HWCAP_LSX)
return &gotoblas_LOONGSON2K1000;
else
return &gotoblas_LOONGSONGENERIC;

View File

@ -0,0 +1,7 @@
# Use the LSX assembly dot kernel (dot_lsx.S) for the SDOT, DSDOT and DDOT
# kernel slots. The three assignments are skipped when NO_LSX is set to a
# non-empty value, in which case this fragment leaves the variables untouched.
ifndef NO_LSX
SDOTKERNEL = dot_lsx.S
DSDOTKERNEL = dot_lsx.S
DDOTKERNEL = dot_lsx.S
endif

View File

@ -1,4 +1,9 @@
ifndef NO_LASX
SDOTKERNEL = dot_lasx.S
DSDOTKERNEL = dot_lasx.S
DDOTKERNEL = dot_lasx.S
DGEMMKERNEL = dgemm_kernel_16x4.S
DGEMMINCOPY = dgemm_ncopy_16.S
DGEMMITCOPY = dgemm_tcopy_16.S

View File

@ -0,0 +1,368 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Dot product kernel (LASX, 256-bit vectors).
 *
 * One source builds three variants, chosen by the preprocessor:
 *   - DOUBLE : double inputs, double accumulation
 *   - DSDOT  : single inputs, widened to double before multiply/accumulate
 *   - neither: single inputs, single accumulation
 *
 * Arguments are read from $r4..$r8 as n, x, inc_x, y, inc_y (increments in
 * elements); the result is returned in $f0.  For n <= 0 the result is 0.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
/* Scratch: loop counter, and the byte size of one element. */
#define I $r17
#define TEMP $r18
/* Don't change following FR unless you know the effects: s1/s2 overlay the
   low lanes of the vector accumulators $xr8/$xr9, which is how the vector
   reduction in .L14 hands its result to the scalar tail code. */
#define s1 $f8
#define s2 $f9
#define a1 $f10
#define b1 $f11
PROLOGUE
#ifdef F_INTERFACE
/* NOTE(review): the F_INTERFACE branches here and at .L20 use MIPS-style
   operand syntax and mnemonics (mult/mflo/dsub); they are only assembled
   when F_INTERFACE is defined and look untested on LoongArch -- confirm
   before enabling. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* Clear both accumulators bitwise.  This zeroes the full vector registers
   and therefore s1/s2 as well.  A bitwise clear is used instead of "v - v"
   because v - v is NaN whenever v is Inf or NaN, and a single-precision
   subtract does not define the upper half of a register that is later
   consumed by double-precision arithmetic (DSDOT). */
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999 /* n <= 0: return 0 */
bne INCX, TEMP, .L20 /* inc_x != 1 */
bne INCY, TEMP, .L20 /* inc_y != 1 */
/* (inc_x == 1) && (inc_y == 1): contiguous, vectorised path */
#ifdef DOUBLE
srai.d I, N, 4
#else
srai.d I, N, 5
#endif
bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */
.align 3
.L11:
/* FLOAT: 32~ ; DOUBLE: 16~ */
/* Main loop: 128 bytes of x and of y per iteration. */
xvld $xr0, X, 0
xvld $xr1, X, 32
xvld $xr2, X, 64
xvld $xr3, X, 96
xvld $xr4, Y, 0
xvld $xr5, Y, 32
xvld $xr6, Y, 64
xvld $xr7, Y, 96
addi.w I, I, -1
addi.d X, X, 128
addi.d Y, Y, 128
#ifdef DSDOT
/* Widen matching halves of x and y to double, then accumulate x * y:
   $xr10/$xr11 = low halves of x/y, $xr12/$xr13 = high halves of x/y. */
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr1
xvfcvtl.d.s $xr11, $xr5
xvfcvth.d.s $xr12, $xr1
xvfcvth.d.s $xr13, $xr5
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr2
xvfcvtl.d.s $xr11, $xr6
xvfcvth.d.s $xr12, $xr2
xvfcvth.d.s $xr13, $xr6
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr3
xvfcvtl.d.s $xr11, $xr7
xvfcvth.d.s $xr12, $xr3
xvfcvth.d.s $xr13, $xr7
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
XVFMADD $xr9, $xr1, $xr5, $xr9
XVFMADD $xr8, $xr2, $xr6, $xr8
XVFMADD $xr9, $xr3, $xr7, $xr9
#endif
bnez I, .L11
.align 3
.L12:
/* Number of whole 32-byte vectors left after the main loop. */
#ifdef DOUBLE
andi I, N, 0xf
srai.d I, I, 2
#else
andi I, N, 0x1f
srai.d I, I, 3
#endif
bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */
.align 3
.L13:
/* FLOAT: 8~31 ; DOUBLE: 4~15 */
xvld $xr0, X, 0
xvld $xr4, Y, 0
addi.w I, I, -1
addi.d X, X, 32
addi.d Y, Y, 32
#ifdef DSDOT
/* Same pairing as in .L11: low(x)*low(y) and high(x)*high(y). */
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
#endif
bnez I, .L13
.align 3
.L14:
/* store dot in s1 $f8 */
/* Horizontal reduction: add $xr9 into $xr8, fold the upper 128 bits onto
   the lower 128 bits, then fold the remaining lanes down to lane 0.
   s2 overlays $xr9, so it is reset (bitwise, for the reasons given at the
   top) once $xr9 has been consumed. */
#ifdef DSDOT
xvfadd.d $xr8, $xr8, $xr9
movgr2fr.d s2, $r0 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
vfadd.d $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0
#else
XVFADD $xr8, $xr8, $xr9
movgr2fr.d s2, $r0 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
VFADD $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3
.L15:
/* Elements that do not fill a whole vector are handled one at a time. */
#ifdef DOUBLE
andi I, N, 0x3
#else
andi I, N, 0x7
#endif
bge $r0, I, .L999 /* =0 */
.align 3
.L16:
/* FLOAT: 1~7 ; DOUBLE: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
bnez I, .L16
b .L999
.align 3
.L20:
/* !((inc_x == 1) && (inc_y == 1)) */
/* Strided scalar path, unrolled by 8; products alternate between s1 and
   s2.  INCX/INCY already hold byte strides at this point. */
srai.d I, N, 3
#ifdef F_INTERFACE
bgez INCX, .L21
addi.d TEMP, N, -1
mult TEMP, INCX
mflo TEMP
dsub X, X, TEMP
.align 3
.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mult TEMP, INCY
mflo TEMP
dsub Y, Y, TEMP
.align 3
.L22:
#endif
bge $r0, I, .L25 /* <8 */
.align 3
.L23:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3
.L25:
/* Strided remainder: n mod 8 elements, one per iteration. */
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3
.L999:
/* Combine the two partial sums into the return register $f0. */
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
move $r4, $r17
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,364 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Dot product kernel (LSX, 128-bit vectors).
 *
 * One source builds three variants, chosen by the preprocessor:
 *   - DOUBLE : double inputs, double accumulation
 *   - DSDOT  : single inputs, widened to double before multiply/accumulate
 *   - neither: single inputs, single accumulation
 *
 * Arguments are read from $r4..$r8 as n, x, inc_x, y, inc_y (increments in
 * elements); the result is returned in $f0.  For n <= 0 the result is 0.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
/* Scratch: loop counter, and the byte size of one element. */
#define I $r17
#define TEMP $r18
/* Don't change following FR unless you know the effects: s1/s2 overlay the
   low lanes of the vector accumulators $vr8/$vr9, which is how the vector
   reduction in .L14 hands its result to the scalar tail code. */
#define s1 $f8
#define s2 $f9
#define a1 $f10
#define b1 $f11
PROLOGUE
#ifdef F_INTERFACE
/* NOTE(review): the F_INTERFACE branches here and at .L20 use MIPS-style
   operand syntax and mnemonics (mult/mflo/dsub); they are only assembled
   when F_INTERFACE is defined and look untested on LoongArch -- confirm
   before enabling. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* Clear both accumulators bitwise.  This zeroes the full vector registers
   and therefore s1/s2 as well.  A bitwise clear is used instead of "v - v"
   because v - v is NaN whenever v is Inf or NaN, and a single-precision
   subtract does not define the upper half of a register that is later
   consumed by double-precision arithmetic (DSDOT). */
vxor.v $vr8, $vr8, $vr8
vxor.v $vr9, $vr9, $vr9
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999 /* n <= 0: return 0 */
bne INCX, TEMP, .L20 /* inc_x != 1 */
bne INCY, TEMP, .L20 /* inc_y != 1 */
/* (inc_x == 1) && (inc_y == 1): contiguous, vectorised path */
#ifdef DOUBLE
srai.d I, N, 3
#else
srai.d I, N, 4
#endif
bge $r0, I, .L12 /* FLOAT: <16 ; DOUBLE: <8 */
.align 3
.L11:
/* FLOAT: 16~ ; DOUBLE: 8~ */
/* Main loop: 64 bytes of x and of y per iteration. */
vld $vr0, X, 0
vld $vr1, X, 16
vld $vr2, X, 32
vld $vr3, X, 48
vld $vr4, Y, 0
vld $vr5, Y, 16
vld $vr6, Y, 32
vld $vr7, Y, 48
addi.w I, I, -1
addi.d X, X, 64
addi.d Y, Y, 64
#ifdef DSDOT
/* Widen matching halves of x and y to double, then accumulate x * y:
   $vr10/$vr11 = low halves of x/y, $vr12/$vr13 = high halves of x/y. */
vfcvtl.d.s $vr10, $vr0
vfcvtl.d.s $vr11, $vr4
vfcvth.d.s $vr12, $vr0
vfcvth.d.s $vr13, $vr4
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
vfcvtl.d.s $vr10, $vr1
vfcvtl.d.s $vr11, $vr5
vfcvth.d.s $vr12, $vr1
vfcvth.d.s $vr13, $vr5
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
vfcvtl.d.s $vr10, $vr2
vfcvtl.d.s $vr11, $vr6
vfcvth.d.s $vr12, $vr2
vfcvth.d.s $vr13, $vr6
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
vfcvtl.d.s $vr10, $vr3
vfcvtl.d.s $vr11, $vr7
vfcvth.d.s $vr12, $vr3
vfcvth.d.s $vr13, $vr7
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
#else
VFMADD $vr8, $vr0, $vr4, $vr8
VFMADD $vr9, $vr1, $vr5, $vr9
VFMADD $vr8, $vr2, $vr6, $vr8
VFMADD $vr9, $vr3, $vr7, $vr9
#endif
bnez I, .L11
.align 3
.L12:
/* Number of whole 16-byte vectors left after the main loop. */
#ifdef DOUBLE
andi I, N, 0x7
srai.d I, I, 1
#else
andi I, N, 0xf
srai.d I, I, 2
#endif
bge $r0, I, .L14 /* DOUBLE: <2 ; FLOAT: <4 */
.align 3
.L13:
/* FLOAT: 4~15 ; DOUBLE: 2~7 */
vld $vr0, X, 0
vld $vr4, Y, 0
addi.w I, I, -1
addi.d X, X, 16
addi.d Y, Y, 16
#ifdef DSDOT
/* Same pairing as in .L11: low(x)*low(y) and high(x)*high(y). */
vfcvtl.d.s $vr10, $vr0
vfcvtl.d.s $vr11, $vr4
vfcvth.d.s $vr12, $vr0
vfcvth.d.s $vr13, $vr4
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
#else
VFMADD $vr8, $vr0, $vr4, $vr8
#endif
bnez I, .L13
.align 3
.L14:
/* store dot in s1 $f8 */
/* Horizontal reduction: add $vr9 into $vr8, then fold the lanes of $vr8
   down to lane 0.  s2 overlays $vr9, so it is reset (bitwise, for the
   reasons given at the top) once $vr9 has been consumed. */
#ifdef DSDOT
vfadd.d $vr8, $vr8, $vr9
movgr2fr.d s2, $r0 /* set s2 to 0.0 */
vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr9
movgr2fr.d s2, $r0 /* set s2 to 0.0 */
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3
.L15:
/* Elements that do not fill a whole vector are handled one at a time. */
#ifdef DOUBLE
andi I, N, 0x1
#else
andi I, N, 0x3
#endif
bge $r0, I, .L999 /* =0 */
.align 3
.L16:
/* DOUBLE: 1 ; FLOAT: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
bnez I, .L16
b .L999
.align 3
.L20:
/* !((inc_x == 1) && (inc_y == 1)) */
/* Strided scalar path, unrolled by 8; products alternate between s1 and
   s2.  INCX/INCY already hold byte strides at this point. */
srai.d I, N, 3
#ifdef F_INTERFACE
bgez INCX, .L21
addi.d TEMP, N, -1
mult TEMP, INCX
mflo TEMP
dsub X, X, TEMP
.align 3
.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mult TEMP, INCY
mflo TEMP
dsub Y, Y, TEMP
.align 3
.L22:
#endif
bge $r0, I, .L25 /* <8 */
.align 3
.L23:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3
.L25:
/* Strided remainder: n mod 8 elements, one per iteration. */
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3
.L999:
/* Combine the two partial sums into the return register $f0. */
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
move $r4, $r17
jirl $r0, $r1, 0x0
EPILOGUE