loongarch: Add optimization for dsdot kernel.

Shiyou Yin 2023-11-24 16:40:32 +08:00
parent 3def6a8143
commit 13b8c44b44
2 changed files with 74 additions and 14 deletions
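
In BLAS terms, dsdot computes the dot product of two single-precision
vectors but carries every product and partial sum in double precision;
only the inputs are float. A minimal scalar sketch of those semantics
(illustrative only, assuming positive increments; not OpenBLAS's
generic kernel):

#include <stddef.h>

/* dsdot semantics: float inputs, double accumulation. */
static double dsdot_ref(size_t n, const float *x, size_t inc_x,
                        const float *y, size_t inc_y)
{
    double dot = 0.0;
    for (size_t i = 0; i < n; i++) {
        /* form each product in double precision before summing */
        dot += (double)x[i * inc_x] * (double)y[i * inc_y];
    }
    return dot;
}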

kernel/loongarch64/KERNEL.LOONGARCH64

@@ -1,7 +1,8 @@
 ifndef NO_LASX
 SDOTKERNEL  = dot_lasx.S
+DSDOTKERNEL = dot_lasx.S
 DDOTKERNEL  = dot_lasx.S
 DGEMMKERNEL = dgemm_kernel_16x4.S
 DGEMMINCOPY = dgemm_ncopy_16.S
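
The Makefile hunk above routes the DSDOT build to the same LASX source
that already provides SDOT and DDOT; the DSDOT and DOUBLE preprocessor
macros select the code paths below. Callers reach the kernel through
the standard CBLAS entry point, e.g.:

#include <stdio.h>
#include <cblas.h>

int main(void)
{
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {4.0f, 3.0f, 2.0f, 1.0f};
    /* cblas_dsdot returns double although x and y are float */
    double dot = cblas_dsdot(4, x, 1, y, 1);
    printf("dsdot = %f\n", dot);  /* 1*4 + 2*3 + 3*2 + 4*1 = 20 */
    return 0;
}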

kernel/loongarch64/dot_lasx.S

@@ -51,6 +51,8 @@ PROLOGUE
     LDINT   INCX, 0(INCX)
     LDINT   INCY, 0(INCY)
 #endif
+    /* init $f8 and $f9 to zero */
     SUB     s1, s1, s1
     SUB     s2, s2, s2
     slli.d  INCX, INCX, BASE_SHIFT
@@ -59,11 +61,8 @@ PROLOGUE
     bge     $r0, N, .L999
     bne     INCX, TEMP, .L20    /* inc_x=1 */
     bne     INCY, TEMP, .L20    /* inc_y=1 */
-#ifdef DOUBLE
-    srai.d  I, N, 4    /* !((inc_x == 1) && (inc_y == 1)) */
-#else
-    srai.d  I, N, 5
-#endif
     /* init $xr8 and $xr9 to zero */
 #ifdef DOUBLE
@@ -71,13 +70,24 @@ PROLOGUE
 #else
     xvldrepl.w  $xr0, X, 0
 #endif
+#ifdef DSDOT
+    xvfcvtl.d.s $xr0, $xr0
+    xvfsub.d    $xr8, $xr0, $xr0
+    xvfsub.d    $xr9, $xr0, $xr0
+#else
     XVFSUB      $xr8, $xr0, $xr0
     XVFSUB      $xr9, $xr0, $xr0
-    /* !((inc_x == 1) && (inc_y == 1)) */
-    bge     $r0, I, .L12    /* <32 */
+#endif
+#ifdef DOUBLE
+    srai.d  I, N, 4
+#else
+    srai.d  I, N, 5
+#endif
+    bge     $r0, I, .L12    /* FLOAT: <32 ; DOUBLE: <16 */
+    .align 3
 .L11:
-    /* case 32~ */
+    /* FLOAT: 32~ ; DOUBLE: 16~ */
     xvld    $xr0, X, 0
     xvld    $xr1, X, 32
     xvld    $xr2, X, 64
     xvld    $xr3, X, 96
@@ -89,11 +99,39 @@ PROLOGUE
     addi.w  I, I, -1
     addi.d  X, X, 128
     addi.d  Y, Y, 128
+#ifdef DSDOT
+    xvfcvtl.d.s $xr10, $xr0
+    xvfcvtl.d.s $xr11, $xr4
+    xvfcvth.d.s $xr12, $xr0
+    xvfcvth.d.s $xr13, $xr4
+    xvfmadd.d   $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d   $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr1
+    xvfcvtl.d.s $xr11, $xr5
+    xvfcvth.d.s $xr12, $xr1
+    xvfcvth.d.s $xr13, $xr5
+    xvfmadd.d   $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d   $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr2
+    xvfcvtl.d.s $xr11, $xr6
+    xvfcvth.d.s $xr12, $xr2
+    xvfcvth.d.s $xr13, $xr6
+    xvfmadd.d   $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d   $xr9, $xr12, $xr13, $xr9
+    xvfcvtl.d.s $xr10, $xr3
+    xvfcvtl.d.s $xr11, $xr7
+    xvfcvth.d.s $xr12, $xr3
+    xvfcvth.d.s $xr13, $xr7
+    xvfmadd.d   $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d   $xr9, $xr12, $xr13, $xr9
+#else
     XVFMADD $xr8, $xr0, $xr4, $xr8
     XVFMADD $xr9, $xr1, $xr5, $xr9
     XVFMADD $xr8, $xr2, $xr6, $xr8
     XVFMADD $xr9, $xr3, $xr7, $xr9
+#endif
     bnez    I, .L11
+    .align 3
 .L12:
 #ifdef DOUBLE
     andi    I, N, 0xf
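
Each pass of .L11 consumes 32 floats per operand (four 256-bit xvld
loads each from X and Y). In the DSDOT path, xvfcvtl.d.s/xvfcvth.d.s
widen each 8-float register into two 4-double registers, and xvfmadd.d
folds the products into the double accumulators $xr8 and $xr9. A
scalar model of one 8-element step (a sketch; the hardware groups the
lanes per 128-bit half, but since the partial products are all summed
in the end the result is the same):

/* Model of one widen-and-accumulate step on 8 floats. */
static void dsdot_step8(const float *x, const float *y,
                        double acc8[4], double acc9[4])
{
    for (int l = 0; l < 4; l++) {
        acc8[l] += (double)x[l]     * (double)y[l];      /* "low" lanes  */
        acc9[l] += (double)x[l + 4] * (double)y[l + 4];  /* "high" lanes */
    }
}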
@@ -102,18 +140,37 @@ PROLOGUE
     andi    I, N, 0x1f
     srai.d  I, I, 3
 #endif
-    bge     $r0, I, .L14    /* <8 */
+    bge     $r0, I, .L14    /* DOUBLE: <4 ; FLOAT: <8 */
+    .align 3
 .L13:
-    /* case 8~31 */
+    /* FLOAT: 8~31 ; DOUBLE: 4~15 */
     xvld    $xr0, X, 0
     xvld    $xr4, Y, 0
     addi.w  I, I, -1
     addi.d  X, X, 32
     addi.d  Y, Y, 32
+#ifdef DSDOT
+    xvfcvtl.d.s $xr10, $xr0
+    xvfcvtl.d.s $xr11, $xr4
+    xvfcvth.d.s $xr12, $xr0
+    xvfcvth.d.s $xr13, $xr4
+    xvfmadd.d   $xr8, $xr10, $xr11, $xr8
+    xvfmadd.d   $xr9, $xr12, $xr13, $xr9
+#else
     XVFMADD $xr8, $xr0, $xr4, $xr8
+#endif
     bnez    I, .L13
+    .align 3
 .L14:
     /* store dot in s1 $f8 */
+#ifdef DSDOT
+    xvfadd.d    $xr8, $xr8, $xr9
+    fsub.s      s2, s2, s2    /* set s2 to 0.0 */
+    xvpermi.q   $xr0, $xr8, 0x1
+    vfadd.d     $vr8, $vr8, $vr0
+    vpackod.d   $vr0, $vr8, $vr8
+    vfadd.d     $vr8, $vr8, $vr0
+#else
     XVFADD  $xr8, $xr8, $xr9
     SUB     s2, s2, s2    /* set s2 to 0.0 */
     xvpermi.q   $xr0, $xr8, 0x1
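
The .L14 block reduces the two 4-lane double accumulators to the
scalar result in $f8 (s1): xvfadd.d merges $xr9 into $xr8, xvpermi.q
folds the upper 128 bits onto the lower, and vpackod.d exposes the
remaining odd lane for the final add. A scalar model of that pairwise
reduction:

/* Fold 4 double lanes to one value, pairwise, as .L14 does. */
static double reduce4(const double acc[4])
{
    double e0 = acc[0] + acc[2];  /* xvpermi.q + vfadd.d: fold 256->128 */
    double e1 = acc[1] + acc[3];
    return e0 + e1;               /* vpackod.d + vfadd.d: fold 128->64 */
}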
@@ -125,7 +182,9 @@ PROLOGUE
     VFADD   $vr8, $vr8, $vr0
     vpackod.w   $vr0, $vr8, $vr8
     VFADD   $vr8, $vr8, $vr0
-#endif
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align 3
 .L15:
 #ifdef DOUBLE
     andi    I, N, 0x3
@@ -135,7 +194,7 @@ PROLOGUE
     bge     $r0, I, .L999    /* =0 */
     .align 3
 .L16:
-    /* case 1~7 */
+    /* FLOAT: 1~7 ; DOUBLE: 1~3 */
     LD      a1, X, 0
     LD      b1, Y, 0
 #ifdef DSDOT
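
The scalar tail at .L16 handles the final 1~7 (FLOAT) or 1~3 (DOUBLE)
elements, again converting before multiplying in the DSDOT case. The
double-precision accumulation is the whole point of dsdot: summing
many float products in a float accumulator discards low-order bits on
every add. A quick demonstration of the difference:

#include <stdio.h>

int main(void)
{
    float  sum_f = 0.0f;  /* sdot-style accumulation  */
    double sum_d = 0.0;   /* dsdot-style accumulation */
    for (int i = 0; i < 10000000; i++) {
        float x = 0.1f, y = 0.1f;
        sum_f += x * y;                  /* rounds on every add */
        sum_d += (double)x * (double)y;  /* keeps the low bits  */
    }
    /* the float accumulator drifts visibly from the double one */
    printf("float  acc: %.4f\n", sum_f);
    printf("double acc: %.4f\n", sum_d);
    return 0;
}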