loongarch64: Fixed clang compilation issues

gxw 2024-04-15 10:31:33 +08:00
parent 15b9fc3f78
commit 7cd438a5ac
27 changed files with 645 additions and 514 deletions

View File

@@ -955,12 +955,18 @@ endif
 ifeq ($(ARCH), loongarch64)
 LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
+LA64_ARCH=$(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64)
 ifneq ($(LA64_ABI), lp64d)
 LA64_ABI=lp64
 endif
+ifneq ($(LA64_ARCH), loongarch64)
+CCOMMON_OPT += -mabi=$(LA64_ABI)
+FCOMMON_OPT += -mabi=$(LA64_ABI)
+else
 CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
 FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
+endif
 endif
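Here the Makefile gains the same compile-probe pattern already used for the ABI check: if the compiler (e.g. an older clang) rejects -march=loongarch64, only the -mabi flag is passed. A standalone sketch of that probe as a shell command (the variable and file names are placeholders, not part of the commit):

    cc=${CC:-cc}
    tmp=$(mktemp -d)
    echo 'int main(void){ return 0; }' > "$tmp/t.c"
    if "$cc" -march=loongarch64 -c "$tmp/t.c" -o "$tmp/t.o" >/dev/null 2>&1; then
        march_flags='-march=loongarch64'   # compiler accepts the flag
    else
        march_flags=''                     # fall back to -mabi only
    fi
    rm -rf "$tmp"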

c_check
View File

@@ -197,10 +197,22 @@ fi
 no_lsx=0
 no_lasx=0
 if [ "$architecture" = "loongarch64" ]; then
+lasx_flags='-march=loongarch64'
+lsx_flags='-march=loongarch64'
 tmpd="$(mktemp -d)"
+tmparch="$tmpd/arch.c"
+printf "void main(void){ }\n" >> "$tmparch"
+args="-march=loongarch64 -o $tmparch.o $tmparch"
+{
+$compiler_name $flags $args >/dev/null 2>&1
+} || {
+lasx_flags=''
+lsx_flags=''
+}
 tmplsx="$tmpd/lsx.c"
 codelsx='"vadd.b $vr0, $vr0, $vr0"'
-lsx_flags='-march=loongarch64'
 printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
 args="$lsx_flags -o $tmplsx.o $tmplsx"
 {
@@ -211,7 +223,6 @@ if [ "$architecture" = "loongarch64" ]; then
 tmplasx="$tmpd/lasx.c"
 codelasx='"xvadd.b $xr0, $xr0, $xr0"'
-lasx_flags='-march=loongarch64'
 printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
 args="$lasx_flags -o $tmplasx.o $tmplasx"
 {
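The LSX/LASX feature probes keep their compile-test approach, but the -march flag is now set up front and cleared if the new arch probe fails, so the feature tests still run under compilers that reject the flag. The fallback idiom used above, in isolation (a sketch; $compiler_name and $flags come from the surrounding script):

    flags_arch='-march=loongarch64'
    {
        $compiler_name $flags -march=loongarch64 -c "$tmpd/arch.c" -o /dev/null >/dev/null 2>&1
    } || {
        flags_arch=''   # compiler rejects -march=loongarch64; probe LSX/LASX without it
    }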

View File

@@ -279,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
 PTR_LD INC_Y, $sp, 0
-push_if_used 17 + 7, 31
+push_if_used 7, 7
 PTR_ADDI K, $r0, 0x01
 PTR_SUB I, INC_X, K
 PTR_SUB J, INC_Y, K
@@ -318,6 +318,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
 CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
 .L_END:
-pop_if_used 17 + 7, 31
+pop_if_used 7, 7
 jirl $r0, $r1, 0x0
 EPILOGUE
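Every kernel in this commit updates its push_if_used/pop_if_used arguments the same way: the macros now take the number of callee-saved registers to spill directly, instead of totals from which the caller-saved counts (17 GPRs, 24 FPRs) were subtracted inside the macro, so the old and new spellings save exactly the same registers. A sketch of the equivalence for the call above (not text from the commit):

    // old: push_if_used 17 + 7, 31  -> saves (17+7)-17 = 7 GPRs and 31-24 = 7 FPRs
    // new: push_if_used 7, 7        -> saves $s0..$s6 and $fs0..$fs6 directly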

View File

@@ -336,7 +336,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
 PTR_LD INC_Y, $sp, 0
-push_if_used 17 + 7, 31
+push_if_used 7, 7
 PTR_ADDI K, $r0, 0x01
 PTR_SUB I, INC_X, K
 PTR_SUB J, INC_Y, K
@@ -378,6 +378,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
 CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
 .L_END:
-pop_if_used 17 + 7, 31
+pop_if_used 7, 7
 jirl $r0, $r1, 0x0
 EPILOGUE

View File

@@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
 PTR_LD INC_Y, $sp, 0
-push_if_used 17 + 8, 30
+push_if_used 8, 6
 PTR_ADDI K, $r0, 0x01
 PTR_SUB I, INC_X, K
 maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -285,6 +285,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
 CGEMV_T_LSX GAP_1, X4_GAP
 .L_END:
-pop_if_used 17 + 8, 30
+pop_if_used 8, 6
 jirl $r0, $r1, 0x0
 EPILOGUE

View File

@@ -304,7 +304,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
 PTR_LD INC_Y, $sp, 0
-push_if_used 17 + 8, 30
+push_if_used 8, 6
 PTR_ADDI K, $r0, 0x01
 PTR_SUB I, INC_X, K
 maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -337,6 +337,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
 CGEMV_T_LASX GAP_1, X8_GAP
 .L_END:
-pop_if_used 17 + 8, 30
+pop_if_used 8, 6
 jirl $r0, $r1, 0x0
 EPILOGUE

View File

@@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define D7 $vr15
 PROLOGUE
-push_if_used 26, 32
+push_if_used 0, 0
 move TD, DST
 move TS, SRC
 slli.d TL, LDA, 0x03
@@ -278,6 +278,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d M, M, -1
 blt ZERO, M, .L_M1
 .L_N0:
-pop_if_used 26, 32
+pop_if_used 0, 0
 jirl $r0, $r1, 0x00
 EPILOGUE

View File

@@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define U7 $vr7
 PROLOGUE
-push_if_used 18, 8
+push_if_used 1, 0
 move S0, SRC
 move P0, DST
@@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 fst.d F0, P3, 0x00
 .L_M0:
-pop_if_used 18, 8
+pop_if_used 1, 0
 jirl $r0, $r1, 0x00
 EPILOGUE

View File

@@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define U7 $vr7
 PROLOGUE
-push_if_used 24, 8
+push_if_used 7, 0
 move S0, SRC
 move P0, DST
@@ -592,6 +592,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d S1, S1, 0x08
 addi.d P4, P4, 0x08
 .L_M0:
-pop_if_used 24, 8
+pop_if_used 7, 0
 jirl $r0, $r1, 0x00
 EPILOGUE

View File

@@ -509,7 +509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
 PTR_LD INC_Y, $sp, 0
-push_if_used 17 + 7, 24 + 4
+push_if_used 7, 4
 PTR_ADDI K, $r0, 0x01
 PTR_SUB I, INC_X, K
 PTR_SUB J, INC_Y, K
@@ -549,6 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
 DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
 .L_END:
-pop_if_used 17 + 7, 24 + 4
+pop_if_used 7, 4
 jirl $r0, $r1, 0x0
 EPILOGUE

View File

@@ -445,7 +445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
 PTR_LD INC_Y, $sp, 0
-push_if_used 17 + 8, 24 + 3
+push_if_used 8, 3
 PTR_ADDI K, $r0, 0x01
 PTR_SUB I, INC_X, K
 maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -476,6 +476,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
 DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
 .L_END:
-pop_if_used 17 + 8, 24 + 3
+pop_if_used 8, 3
 jirl $r0, $r1, 0x0
 EPILOGUE

View File

@@ -1029,7 +1029,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 PROLOGUE
-push_if_used 26, 32
+push_if_used 9, 8
 PTR_SLLI LDC, LDC, 3
 /* if (!(N >> 2)) goto L_N3 */
 PTR_SRAI J, N, 2 /* J = bn >> 2 */
@@ -1361,6 +1361,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 blt ZERO, I, .L_N1_I1
 .L_N1_M0:
 .L_N0:
-pop_if_used 26, 32
+pop_if_used 9, 8
 jirl $r0, $r1, 0x0
 EPILOGUE

View File

@@ -128,31 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dtrsm_kernel_macro.S"
-.macro ldrepl_macro start, end, stride
+.macro ldrepl_macro stride:req, index:req, more:vararg
 // Load Ux (x = 0...15)
-.if \start <= \end
-GLDREPL xv, d, $xr\start, A0, \stride * 8
-ldrepl_macro %start + 1, \end, %stride + 1
+GLDREPL xv, d, $xr\index, A0, \index * 8 - \stride * 8
+.ifnb \more
+ldrepl_macro \stride, \more
 .endif
 .endm
-.macro nmsub_macro start0, end0, start1, reg
+.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
 // Gx -= reg * Ux
-.if \start0 <= \end0
 xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
-nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.ifnb \more
+nmsub_macro \reg, \more
 .endif
 .endm
-.macro B_st_macro start, end, stride, N
+.macro B_st_macro N:req, stride:req, start:req, more:vararg
 // Store Gx(x = 16...31)
-.if \start <= \end
 .if \N == 4
-xvst $xr\start, B0, \stride * 0x20
+xvst $xr\start, B0, \start * 0x20 - \stride * 0x20
 .elseif \N == 2
-vst $vr\start, B0, \stride * 0x10
+vst $vr\start, B0, \start * 0x10 - \stride * 0x10
 .elseif \N == 1
-fst.d $f\start, B0, \stride * 0x08
+fst.d $f\start, B0, \start * 0x08 - \stride * 0x08
 .endif
-B_st_macro %start + 1, \end, %stride + 1, \N
+.ifnb \more
+B_st_macro \N, \stride, \more
 .endif
 .endm
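This macro rewrite is the heart of the clang fix and recurs in the other trsm kernels below: the old macros counted with .altmacro arithmetic (%start + 1), which clang's integrated assembler does not support, while the new ones take an explicit register list and recurse via :vararg and .ifnb. A sketch of one expansion under the new definition (not text from the commit):

    ldrepl_macro 0, 0, 1, 2
    // peels index=0, then recurses on "1, 2", emitting:
    //   GLDREPL xv, d, $xr0, A0, 0 * 8 - 0 * 8
    //   GLDREPL xv, d, $xr1, A0, 1 * 8 - 0 * 8
    //   GLDREPL xv, d, $xr2, A0, 2 * 8 - 0 * 8
    // i.e. the same loads the old "ldrepl_macro 0, 2, 0" produced via %start + 1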
@@ -194,86 +194,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 255
 // Sequentially extract data from A in row order
 // Load 0
-ldrepl_macro 0, 15, 0
+ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G0, G0, U0
-nmsub_macro 17, 31, 1, G0
+nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \
+            25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 1
-ldrepl_macro 1, 15, 0
+ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G1, G1, U1
-nmsub_macro 18, 31, 2, G1
+nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \
+            25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 2
-ldrepl_macro 2, 15, 0
+ldrepl_macro 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G2, G2, U2
-nmsub_macro 19, 31, 3, G2
+nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, \
+            10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 3
-ldrepl_macro 3, 15, 0
+ldrepl_macro 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G3, G3, U3
-nmsub_macro 20, 31, 4, G3
+nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, \
+            27, 11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 4
-ldrepl_macro 4, 15, 0
+ldrepl_macro 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G4, G4, U4
-nmsub_macro 21, 31, 5, G4
+nmsub_macro G4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, \
+            28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 5
-ldrepl_macro 5, 15, 0
+ldrepl_macro 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G5, G5, U5
-nmsub_macro 22, 31, 6, G5
+nmsub_macro G5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, \
+            29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 6
-ldrepl_macro 6, 15, 0
+ldrepl_macro 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G6, G6, U6
-nmsub_macro 23, 31, 7, G6
+nmsub_macro G6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, \
+            30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 7
-ldrepl_macro 7, 15, 0
+ldrepl_macro 7, 7, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G7, G7, U7
-nmsub_macro 24, 31, 8, G7
+nmsub_macro G7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 8
-ldrepl_macro 8, 15, 0
+ldrepl_macro 8, 8, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G8, G8, U8
-nmsub_macro 25, 31, 9, G8
+nmsub_macro G8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 9
-ldrepl_macro 9, 15, 0
+ldrepl_macro 9, 9, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G9, G9, U9
-nmsub_macro 26, 31, 10, G9
+nmsub_macro G9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 10
-ldrepl_macro 10, 15, 0
+ldrepl_macro 10, 10, 11, 12, 13, 14, 15
 GMUL xvf, d, G10, G10, U10
-nmsub_macro 27, 31, 11, G10
+nmsub_macro G10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 11
-ldrepl_macro 11, 15, 0
+ldrepl_macro 11, 11, 12, 13, 14, 15
 GMUL xvf, d, G11, G11, U11
-nmsub_macro 28, 31, 12, G11
+nmsub_macro G11, 28, 12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 12
-ldrepl_macro 12, 15, 0
+ldrepl_macro 12, 12, 13, 14, 15
 GMUL xvf, d, G12, G12, U12
-nmsub_macro 29, 31, 13, G12
+nmsub_macro G12, 29, 13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 13
-ldrepl_macro 13, 15, 0
+ldrepl_macro 13, 13, 14, 15
 GMUL xvf, d, G13, G13, U13
-nmsub_macro 30, 31, 14, G13
+nmsub_macro G13, 30, 14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 14
-ldrepl_macro 14, 15, 0
+ldrepl_macro 14, 14, 15
 GMUL xvf, d, G14, G14, U14
-nmsub_macro 31, 31, 15, G14
+nmsub_macro G14, 31, 15
 PTR_ADDI A0, A0, 17 * 8
 // Load 15
-ldrepl_macro 15, 15, 0
+ldrepl_macro 15, 15
 GMUL xvf, d, G15, G15, U15
 // Finally, We can store the result.
 // For B, stored sequentially, and C, first transpose and then store
-B_st_macro 16, 31, 0, \N
+B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
 GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
 GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1
@@ -334,46 +341,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 63
 // Sequentially extract data from A in row order
 // Load 0
-ldrepl_macro 0, 7, 0
+ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7
 GMUL xvf, d, G0, G0, U0
-nmsub_macro 17, 23, 1, G0
+nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
 PTR_ADDI A0, A0, 9 * 8
 // Load 1
-ldrepl_macro 1, 7, 0
+ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7
 GMUL xvf, d, G1, G1, U1
-nmsub_macro 18, 23, 2, G1
+nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
 PTR_ADDI A0, A0, 9 * 8
 // Load 2
-ldrepl_macro 2, 7, 0
+ldrepl_macro 2, 2, 3, 4, 5, 6, 7
 GMUL xvf, d, G2, G2, U2
-nmsub_macro 19, 23, 3, G2
+nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
 PTR_ADDI A0, A0, 9 * 8
 // Load 3
-ldrepl_macro 3, 7, 0
+ldrepl_macro 3, 3, 4, 5, 6, 7
 GMUL xvf, d, G3, G3, U3
-nmsub_macro 20, 23, 4, G3
+nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7
 PTR_ADDI A0, A0, 9 * 8
 // Load 4
-ldrepl_macro 4, 7, 0
+ldrepl_macro 4, 4, 5, 6, 7
 GMUL xvf, d, G4, G4, U4
-nmsub_macro 21, 23, 5, G4
+nmsub_macro G4, 21, 5, 22, 6, 23, 7
 PTR_ADDI A0, A0, 9 * 8
 // Load 5
-ldrepl_macro 5, 7, 0
+ldrepl_macro 5, 5, 6, 7
 GMUL xvf, d, G5, G5, U5
-nmsub_macro 22, 23, 6, G5
+nmsub_macro G5, 22, 6, 23, 7
 PTR_ADDI A0, A0, 9 * 8
 // Load 6
-ldrepl_macro 6, 7, 0
+ldrepl_macro 6, 6, 7
 GMUL xvf, d, G6, G6, U6
-nmsub_macro 23, 23, 7, G6
+nmsub_macro G6, 23, 7
 PTR_ADDI A0, A0, 9 * 8
 // Load 7
-ldrepl_macro 7, 7, 0
+ldrepl_macro 7, 7
 GMUL xvf, d, G7, G7, U7
 // Finally, We can store the result.
 // For B, stored sequentially, and C, first transpose and then store
-B_st_macro 16, 23, 0, \N
+B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23
 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
 GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
 .if \N == 4
@@ -437,26 +444,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 15
 // Sequentially extract data from A in row order
 // Load 0
-ldrepl_macro 0, 3, 0
+ldrepl_macro 0, 0, 1, 2, 3
 GMUL xvf, d, G0, G0, U0
-nmsub_macro 17, 19, 1, G0
+nmsub_macro G0, 17, 1, 18, 2, 19, 3
 PTR_ADDI A0, A0, 5 * 8
 // Load 1
-ldrepl_macro 1, 3, 0
+ldrepl_macro 1, 1, 2, 3
 GMUL xvf, d, G1, G1, U1
-nmsub_macro 18, 19, 2, G1
+nmsub_macro G1, 18, 2, 19, 3
 PTR_ADDI A0, A0, 5 * 8
 // Load 2
-ldrepl_macro 2, 3, 0
+ldrepl_macro 2, 2, 3
 GMUL xvf, d, G2, G2, U2
-nmsub_macro 19, 19, 3, G2
+nmsub_macro G2, 19, 3
 PTR_ADDI A0, A0, 5 * 8
 // Load 3
-ldrepl_macro 3, 3, 0
+ldrepl_macro 3, 3
 GMUL xvf, d, G3, G3, U3
 // Finally, We can store the result.
 // For B, stored sequentially, and C, first transpose and then store
-B_st_macro 16, 19, 0, \N
+B_st_macro \N, 16, 16, 17, 18, 19
 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
 .if \N == 4
 GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00
@@ -501,16 +508,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 3
 // Sequentially extract data from A in row order
 // Load 0
-ldrepl_macro 0, 1, 0
+ldrepl_macro 0, 0, 1
 GMUL xvf, d, G0, G0, U0
-nmsub_macro 17, 17, 1, G0
+nmsub_macro G0, 17, 1
 PTR_ADDI A0, A0, 3 * 8
 // Load 1
-ldrepl_macro 1, 1, 0
+ldrepl_macro 1, 1
 GMUL xvf, d, G1, G1, U1
 // Finally, We can store the result.
 // For B, stored sequentially, and C, first transpose and then store
-B_st_macro 16, 17, 0, \N
+B_st_macro \N, 16, 16, 17
 GSBUTTERFLY xv, d, U0, U1, G1, G0
 .if \N == 4
 vst $vr0, C0, 0x00
@@ -717,7 +724,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 PROLOGUE
-push_if_used 26, 32
+push_if_used 9, 8
 PTR_SLLI LDC, LDC, 3
 /* if (!(N >> 2)) goto L_N3 */
 PTR_SRAI J, N, 2 /* J = bn >> 2 */
@@ -954,6 +961,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PTR_ADD AA, AA, T0 // aa += 1 * k
 .L_N1_M0:
 .L_N0:
-pop_if_used 26, 32
+pop_if_used 9, 8
 jirl $r0, $r1, 0x0
 EPILOGUE
EPILOGUE EPILOGUE

View File

@@ -128,33 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dtrsm_kernel_macro.S"
-.macro ldrepl_macro start, end, stride
+.macro ldrepl_macro stride:req, index:req, more:vararg
 // Load Ux (x = 0...15)
-.if \start <= \end
-GLDREPL xv, d, $xr\start, B0, \stride * 8
-ldrepl_macro %start + 1, \end, %stride + 1
+GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
+.ifnb \more
+ldrepl_macro \stride, \more
 .endif
 .endm
-.macro nmsub_macro start0, end0, start1, reg
-// Ux -= reg * Dx
-.if \start0 <= \end0
+.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
+// Gx -= reg * Ux
 xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
-nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.ifnb \more
+nmsub_macro \reg, \more
 .endif
 .endm
-.macro A_st_macro start, end, stride, N
-// Store Ux(x = 0...15)
-.if \start <= \end
+.macro A_st_macro N:req, stride:req, start:req, more:vararg
+// Store Gx(x = 16...31)
 .if \N == 4
-xvst $xr\start, A0, \stride * 0x20
+xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
 .elseif \N == 2
-vst $vr\start, A0, \stride * 0x10
+vst $vr\start, A0, \start * 0x10 - \stride * 0x10
 .elseif \N == 1
-fst.d $f\start, A0, \stride * 0x08
+fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
 .endif
-A_st_macro %start + 1, \end, %stride + 1, \N
+.ifnb \more
+A_st_macro \N, \stride, \more
 .endif
 .endm
@@ -167,22 +165,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 19, 0
+ldrepl_macro 16, 16, 17, 18, 19
 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
-ldrepl_macro 20, 22, 5
-nmsub_macro 4, 7, 0, D1
-ldrepl_macro 23, 24, 10
+ldrepl_macro 15, 20, 21, 22
+nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
+ldrepl_macro 13, 23, 24
 GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7
-ldrepl_macro 25, 25, 15
-nmsub_macro 8, 11, 0, D2
-nmsub_macro 8, 11, 4, D5
+ldrepl_macro 10, 25
+nmsub_macro D2, 8, 0, 9, 1, 10, 2, 11, 3
+nmsub_macro D5, 8, 4, 9, 5, 10, 6, 11, 7
 GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11
-nmsub_macro 12, 15, 0, D3
-nmsub_macro 12, 15, 4, D6
-nmsub_macro 12, 15, 8, D8
+nmsub_macro D3, 12, 0, 13, 1, 14, 2, 15, 3
+nmsub_macro D6, 12, 4, 13, 5, 14, 6, 15, 7
+nmsub_macro D8, 12, 8, 13, 9, 14, 10, 15, 11
 GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
 // Store A
-A_st_macro 0, 15, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 // Store C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
 U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \
@@ -197,13 +196,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 17, 0
+ldrepl_macro 16, 16, 17
 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
-ldrepl_macro 18, 18, 3
-nmsub_macro 4, 7, 0, D1
+ldrepl_macro 15, 18
+nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
 // Store A
-A_st_macro 0, 7, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
 // Store C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
 U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
@@ -218,22 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 19, 0
+ldrepl_macro 16, 16, 17, 18, 19
 GMUL xvf, d, U0, D0, U0, U1, D0, U1
-ldrepl_macro 20, 22, 5
-nmsub_macro 2, 3, 0, D1
-ldrepl_macro 23, 24, 10
+ldrepl_macro 15, 20, 21, 22
+nmsub_macro D1, 2, 0, 3, 1
+ldrepl_macro 13, 23, 24
 GMUL xvf, d, U2, D4, U2, U3, D4, U3
-ldrepl_macro 25, 25, 15
-nmsub_macro 4, 5, 0, D2
-nmsub_macro 4, 5, 2, D5
+ldrepl_macro 10, 25
+nmsub_macro D2, 4, 0, 5, 1
+nmsub_macro D5, 4, 2, 5, 3
 GMUL xvf, d, U4, D7, U4, U5, D7, U5
-nmsub_macro 6, 7, 0, D3
-nmsub_macro 6, 7, 2, D6
-nmsub_macro 6, 7, 4, D8
+nmsub_macro D3, 6, 0, 7, 1
+nmsub_macro D6, 6, 2, 7, 3
+nmsub_macro D8, 6, 4, 7, 5
 GMUL xvf, d, U6, D9, U6, U7, D9, U7
 // Store A
-A_st_macro 0, 7, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
 // Store C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
 U2, C1, 0x00, U3, C1, 0x20, \
@@ -248,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 17, 0
+ldrepl_macro 16, 16, 17
 GMUL xvf, d, U0, D0, U0, U1, D0, U1
-ldrepl_macro 18, 18, 3
-nmsub_macro 2, 3, 0, D1
+ldrepl_macro 15, 18
+nmsub_macro D1, 2, 0, 3, 1
 GMUL xvf, d, U2, D2, U2, U3, D2, U3
 // Store A
-A_st_macro 0, 3, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3
 // Store C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
 U2, C1, 0x00, U3, C1, 0x20
@@ -269,22 +268,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 19, 0
+ldrepl_macro 16, 16, 17, 18, 19
 GMUL xvf, d, U0, D0, U0
-ldrepl_macro 20, 22, 5
-nmsub_macro 1, 1, 0, D1
-ldrepl_macro 23, 24, 10
+ldrepl_macro 15, 20, 21, 22
+nmsub_macro D1, 1, 0
+ldrepl_macro 13, 23, 24
 GMUL xvf, d, U1, D4, U1
-ldrepl_macro 25, 25, 15
-nmsub_macro 2, 2, 0, D2
-nmsub_macro 2, 2, 1, D5
+ldrepl_macro 10, 25
+nmsub_macro D2, 2, 0
+nmsub_macro D5, 2, 1
 GMUL xvf, d, U2, D7, U2
-nmsub_macro 3, 3, 0, D3
-nmsub_macro 3, 3, 1, D6
-nmsub_macro 3, 3, 2, D8
+nmsub_macro D3, 3, 0
+nmsub_macro D6, 3, 1
+nmsub_macro D8, 3, 2
 GMUL xvf, d, U3, D9, U3
 // Store A
-A_st_macro 0, 3, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3
 // Store C
 GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
 .endm
@@ -296,13 +295,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 17, 0
+ldrepl_macro 16, 16, 17
 GMUL xvf, d, U0, D0, U0
-ldrepl_macro 18, 18, 3
-nmsub_macro 1, 1, 0, D1
+ldrepl_macro 15, 18
+nmsub_macro D1, 1, 0
 GMUL xvf, d, U1, D2, U1
 // Store A
-A_st_macro 0, 1, 0, 4
+A_st_macro 4, 0, 0, 1
 // Store C
 GST xv, , U0, C0, 0x00, U1, C1, 0x00
 .endm
@@ -316,23 +315,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 19, 0
+ldrepl_macro 16, 16, 17, 18, 19
 GMUL xvf, d, U0, D0, U0
-ldrepl_macro 20, 22, 5
-nmsub_macro 1, 1, 0, D1
-ldrepl_macro 23, 24, 10
+ldrepl_macro 15, 20, 21, 22
+nmsub_macro D1, 1, 0
+ldrepl_macro 13, 23, 24
 GMUL xvf, d, U1, D4, U1
-ldrepl_macro 25, 25, 15
-nmsub_macro 2, 2, 0, D2
-nmsub_macro 2, 2, 1, D5
+ldrepl_macro 10, 25
+nmsub_macro D2, 2, 0
+nmsub_macro D5, 2, 1
 GMUL xvf, d, U2, D7, U2
-nmsub_macro 3, 3, 0, D3
-nmsub_macro 3, 3, 1, D6
-nmsub_macro 3, 3, 2, D8
+nmsub_macro D3, 3, 0
+nmsub_macro D6, 3, 1
+nmsub_macro D8, 3, 2
 GMUL xvf, d, U3, D9, U3
 // Store A
-A_st_macro 0, 3, 0, 2
+A_st_macro 2, 0, 0, 1, 2, 3
 // Store C
 GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00,
 .endm
@@ -344,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 17, 0
+ldrepl_macro 16, 16, 17
 GMUL xvf, d, U0, D0, U0
-ldrepl_macro 18, 18, 3
-nmsub_macro 1, 1, 0, D1
+ldrepl_macro 15, 18
+nmsub_macro D1, 1, 0
 GMUL xvf, d, U1, D2, U1
 // Store A
-A_st_macro 0, 1, 0, 2
+A_st_macro 2, 0, 0, 1
 // Store C
 GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
 .endm
@@ -364,23 +363,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 19, 0
+ldrepl_macro 16, 16, 17, 18, 19
 GMUL xvf, d, U0, D0, U0
-ldrepl_macro 20, 22, 5
-nmsub_macro 1, 1, 0, D1
-ldrepl_macro 23, 24, 10
+ldrepl_macro 15, 20, 21, 22
+nmsub_macro D1, 1, 0
+ldrepl_macro 13, 23, 24
 GMUL xvf, d, U1, D4, U1
-ldrepl_macro 25, 25, 15
-nmsub_macro 2, 2, 0, D2
-nmsub_macro 2, 2, 1, D5
+ldrepl_macro 10, 25
+nmsub_macro D2, 2, 0
+nmsub_macro D5, 2, 1
 GMUL xvf, d, U2, D7, U2
-nmsub_macro 3, 3, 0, D3
-nmsub_macro 3, 3, 1, D6
-nmsub_macro 3, 3, 2, D8
+nmsub_macro D3, 3, 0
+nmsub_macro D6, 3, 1
+nmsub_macro D8, 3, 2
 GMUL xvf, d, U3, D9, U3
 // Store A
-A_st_macro 0, 3, 0, 1
+A_st_macro 1, 0, 0, 1, 2, 3
 // Store C
 GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
 .endm
@@ -392,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 17, 0
+ldrepl_macro 16, 16, 17
 GMUL xvf, d, U0, D0, U0
-ldrepl_macro 18, 18, 3
-nmsub_macro 1, 1, 0, D1
+ldrepl_macro 15, 18
+nmsub_macro D1, 1, 0
 GMUL xvf, d, U1, D2, U1
 // Store A
-A_st_macro 0, 1, 0, 1
+A_st_macro 1, 0, 0, 1
 // Store C
 GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
 .endm
@@ -582,10 +581,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xvld U2, C0, 0x40
 xvld U3, C0, 0x60
 .L_dsolve_16x1:
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
 // Store A
-A_st_macro 0, 3, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3
 // Strore C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
 .endm
@@ -599,10 +598,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xvld U0, C0, 0x00
 xvld U1, C0, 0x20
 .L_dsolve_8x1:
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0, U1, D0, U1
 // Store A
-A_st_macro 0, 1, 0, 4
+A_st_macro 4, 0, 0, 1
 // Strore C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20
 .endm
@@ -615,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* Load C0 */
 xvld U0, C0, 0x00
 .L_dsolve_4x1:
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 0, 0, 4
+A_st_macro 4, 0, 0
 // Strore C
 GST xv, , U0, C0, 0x00
 .endm
@@ -631,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* Load C0 */
 xvld U0, C0, 0x00
 .L_dsolve_2x1:
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 0, 0, 2
+A_st_macro 2, 0, 0
 // Strore C
 GST v, , $vr0, C0, 0x00
 .endm
@@ -647,16 +646,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // Load C
 fld.d $f0, C0, 0x00
 .L_dsolve_1x1:
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 0, 0, 1
+A_st_macro 1, 0, 0
 // Strore C
 GST f, d, $f0, C0, 0x00
 .endm
 PROLOGUE
-push_if_used 26, 32
+push_if_used 9, 8
 PTR_SLLI LDC, LDC, 3
 PTR_SUB KK, ZERO, OFFSET
 /* if (!(N >> 2)) goto L_N3 */
@@ -877,6 +876,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PTR_ADD AA, AA, T0 // aa += 1 * k
 .L_N1_M0:
 .L_N0:
-pop_if_used 26, 32
+pop_if_used 9, 8
 jirl $r0, $r1, 0x0
 EPILOGUE

View File

@@ -111,33 +111,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dtrsm_kernel_macro.S"
-.macro ldrepl_macro start, end, stride
+.macro ldrepl_macro stride:req, index:req, more:vararg
 // Load Ux (x = 0...15)
-.if \start <= \end
-GLDREPL xv, d, $xr\start, B0, \stride * 8
-ldrepl_macro %start + 1, \end, %stride + 1
+GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
+.ifnb \more
+ldrepl_macro \stride, \more
 .endif
 .endm
-.macro nmsub_macro start0, end0, start1, reg
-// Ux -= reg * Dx
-.if \start0 <= \end0
+.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
+// Gx -= reg * Ux
 xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
-nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.ifnb \more
+nmsub_macro \reg, \more
 .endif
 .endm
-.macro A_st_macro start, end, stride, N
-// Store Ux(x = 0...15)
-.if \start <= \end
+.macro A_st_macro N:req, stride:req, start:req, more:vararg
+// Store Gx(x = 16...31)
 .if \N == 4
-xvst $xr\start, A0, \stride * 0x20
+xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
 .elseif \N == 2
-vst $vr\start, A0, \stride * 0x10
+vst $vr\start, A0, \start * 0x10 - \stride * 0x10
 .elseif \N == 1
-fst.d $f\start, A0, \stride * 0x08
+fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
 .endif
-A_st_macro %start + 1, \end, %stride + 1, \N
+.ifnb \more
+A_st_macro \N, \stride, \more
 .endif
 .endm
@@ -148,13 +146,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 16, 0
-ldrepl_macro 17, 18, 2
+ldrepl_macro 16, 16
+ldrepl_macro 15, 17, 18
 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
-nmsub_macro 0, 3, 4, D1
+nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
 // Store A
-A_st_macro 0, 7, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
 // Store C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
 U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
@@ -167,13 +165,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 16, 0
-ldrepl_macro 17, 18, 2
+ldrepl_macro 16, 16
+ldrepl_macro 15, 17, 18
 GMUL xvf, d, U2, D2, U2, U3, D2, U3
-nmsub_macro 0, 1, 2, D1
+nmsub_macro D1, 0, 2, 1, 3
 GMUL xvf, d, U0, D0, U0, U1, D0, U1
 // Store A
-A_st_macro 0, 3, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3
 // Store C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
 U2, C1, 0x00, U3, C1, 0x20
@@ -186,13 +184,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 16, 0
-ldrepl_macro 17, 18, 2
+ldrepl_macro 16, 16
+ldrepl_macro 15, 17, 18
 GMUL xvf, d, U1, D2, U1
-nmsub_macro 0, 0, 1, D1
+nmsub_macro D1, 0, 1
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 1, 0, 4
+A_st_macro 4, 0, 0, 1
 // Store C
 GST xv, , U0, C0, 0x00, U1, C1, 0x00
 .endm
@@ -204,13 +202,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 16, 0
-ldrepl_macro 17, 18, 2
+ldrepl_macro 16, 16
+ldrepl_macro 15, 17, 18
 GMUL xvf, d, U1, D2, U1
-nmsub_macro 0, 0, 1, D1
+nmsub_macro D1, 0, 1
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 1, 0, 2
+A_st_macro 2, 0, 0, 1
 // Store C
 GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
 .endm
@@ -222,13 +220,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-ldrepl_macro 16, 16, 0
-ldrepl_macro 17, 18, 2
+ldrepl_macro 16, 16
+ldrepl_macro 15, 17, 18
 GMUL xvf, d, U1, D2, U1
-nmsub_macro 0, 0, 1, D1
+nmsub_macro D1, 0, 1
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 1, 0, 1
+A_st_macro 1, 0, 0, 1
 // Store C
 GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
 .endm
@@ -242,22 +240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //8 9 10
 //12 13 14 15
 // Sequentially extract data from B in row order
-ldrepl_macro 22, 25, 12
+ldrepl_macro 10, 22, 23, 24, 25
 GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
-ldrepl_macro 19, 21, 8
-nmsub_macro 8, 11, 12, D8
-ldrepl_macro 17, 18, 4
+ldrepl_macro 11, 19, 20, 21
+nmsub_macro D8, 8, 12, 9, 13, 10, 14, 11, 15
+ldrepl_macro 13, 17, 18
 GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11
-ldrepl_macro 16, 16, 0
-nmsub_macro 4, 7, 12, D7
-nmsub_macro 4, 7, 8, D4
+ldrepl_macro 16, 16
+nmsub_macro D7, 4, 12, 5, 13, 6, 14, 7, 15
+nmsub_macro D4, 4, 8, 5, 9, 6, 10, 7, 11
 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
-nmsub_macro 0, 3, 12, D6
-nmsub_macro 0, 3, 8, D3
-nmsub_macro 0, 3, 4, D1
+nmsub_macro D6, 0, 12, 1, 13, 2, 14, 3, 15
+nmsub_macro D3, 0, 8, 1, 9, 2, 10, 3, 11
+nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
 // Store A
-A_st_macro 0, 15, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 // Store C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
 U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \
@@ -274,22 +272,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //8 9 10
 //12 13 14 15
 // Sequentially extract data from B in row order
-ldrepl_macro 22, 25, 12
+ldrepl_macro 10, 22, 23, 24, 25
 GMUL xvf, d, U6, D9, U6, U7, D9, U7
-ldrepl_macro 19, 21, 8
-nmsub_macro 4, 5, 6, D8
-ldrepl_macro 17, 18, 4
+ldrepl_macro 11, 19, 20, 21
+nmsub_macro D8, 4, 6, 5, 7
+ldrepl_macro 13, 17, 18
 GMUL xvf, d, U4, D5, U4, U5, D5, U5
-ldrepl_macro 16, 16, 0
-nmsub_macro 2, 3, 6, D7
-nmsub_macro 2, 3, 4, D4
+ldrepl_macro 16, 16
+nmsub_macro D7, 2, 6, 3, 7
+nmsub_macro D4, 2, 4, 3, 5
 GMUL xvf, d, U2, D2, U2, U3, D2, U3
-nmsub_macro 0, 1, 6, D6
-nmsub_macro 0, 1, 4, D3
-nmsub_macro 0, 1, 2, D1
+nmsub_macro D6, 0, 6, 1, 7
+nmsub_macro D3, 0, 4, 1, 5
+nmsub_macro D1, 0, 2, 1, 3
 GMUL xvf, d, U0, D0, U0, U1, D0, U1
 // Store A
-A_st_macro 0, 7, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
 // Store C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
 U2, C1, 0x00, U3, C1, 0x20, \
@@ -306,22 +304,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //8 9 10
 //12 13 14 15
 // Sequentially extract data from B in row order
-ldrepl_macro 22, 25, 12
+ldrepl_macro 10, 22, 23, 24, 25
 GMUL xvf, d, U3, D9, U3
-ldrepl_macro 19, 21, 8
-nmsub_macro 2, 2, 3, D8
-ldrepl_macro 17, 18, 4
+ldrepl_macro 11, 19, 20, 21
+nmsub_macro D8, 2, 3
+ldrepl_macro 13, 17, 18
 GMUL xvf, d, U2, D5, U2
-ldrepl_macro 16, 16, 0
-nmsub_macro 1, 1, 3, D7
-nmsub_macro 1, 1, 2, D4
+ldrepl_macro 16, 16
+nmsub_macro D7, 1, 3
+nmsub_macro D4, 1, 2
 GMUL xvf, d, U1, D2, U1
-nmsub_macro 0, 0, 3, D6
-nmsub_macro 0, 0, 2, D3
-nmsub_macro 0, 0, 1, D1
+nmsub_macro D6, 0, 3
+nmsub_macro D3, 0, 2
+nmsub_macro D1, 0, 1
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 3, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3
 // Store C
 GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
 .endm
@@ -335,22 +333,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //8 9 10
 //12 13 14 15
 // Sequentially extract data from B in row order
-ldrepl_macro 22, 25, 12
+ldrepl_macro 10, 22, 23, 24, 25
 GMUL xvf, d, U3, D9, U3
-ldrepl_macro 19, 21, 8
-nmsub_macro 2, 2, 3, D8
-ldrepl_macro 17, 18, 4
+ldrepl_macro 11, 19, 20, 21
+nmsub_macro D8, 2, 3
+ldrepl_macro 13, 17, 18
 GMUL xvf, d, U2, D5, U2
-ldrepl_macro 16, 16, 0
-nmsub_macro 1, 1, 3, D7
-nmsub_macro 1, 1, 2, D4
+ldrepl_macro 16, 16
+nmsub_macro D7, 1, 3
+nmsub_macro D4, 1, 2
 GMUL xvf, d, U1, D2, U1
-nmsub_macro 0, 0, 3, D6
-nmsub_macro 0, 0, 2, D3
-nmsub_macro 0, 0, 1, D1
+nmsub_macro D6, 0, 3
+nmsub_macro D3, 0, 2
+nmsub_macro D1, 0, 1
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 3, 0, 2
+A_st_macro 2, 0, 0, 1, 2, 3
 // Store C
 GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
 .endm
@@ -364,22 +362,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //8 9 10
 //12 13 14 15
 // Sequentially extract data from B in row order
-ldrepl_macro 22, 25, 12
+ldrepl_macro 10, 22, 23, 24, 25
 GMUL xvf, d, U3, D9, U3
-ldrepl_macro 19, 21, 8
-nmsub_macro 2, 2, 3, D8
-ldrepl_macro 17, 18, 4
+ldrepl_macro 11, 19, 20, 21
+nmsub_macro D8, 2, 3
+ldrepl_macro 13, 17, 18
 GMUL xvf, d, U2, D5, U2
-ldrepl_macro 16, 16, 0
-nmsub_macro 1, 1, 3, D7
-nmsub_macro 1, 1, 2, D4
+ldrepl_macro 16, 16
+nmsub_macro D7, 1, 3
+nmsub_macro D4, 1, 2
 GMUL xvf, d, U1, D2, U1
-nmsub_macro 0, 0, 3, D6
-nmsub_macro 0, 0, 2, D3
-nmsub_macro 0, 0, 1, D1
+nmsub_macro D6, 0, 3
+nmsub_macro D3, 0, 2
+nmsub_macro D1, 0, 1
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 3, 0, 1
+A_st_macro 1, 0, 0, 1, 2, 3
 // Store C
 GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
 .endm
@@ -399,10 +397,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_dsolve_16x1:
 PTR_ADDI A0, T1, -16 * 8
 PTR_ADDI B0, T2, -1 * 8
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
 // Store A
-A_st_macro 0, 3, 0, 4
+A_st_macro 4, 0, 0, 1, 2, 3
 // Strore C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
 .endm
@@ -420,10 +418,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_dsolve_8x1:
 PTR_ADDI A0, T1, -8 * 8
 PTR_ADDI B0, T2, -1 * 8
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0, U1, D0, U1
 // Store A
-A_st_macro 0, 1, 0, 4
+A_st_macro 4, 0, 0, 1
 // Strore C
 GST xv, , U0, C0, 0x00, U1, C0, 0x20
 .endm
@@ -440,10 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_dsolve_4x1:
 PTR_ADDI A0, T1, -4 * 8
 PTR_ADDI B0, T2, -1 * 8
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 0, 0, 4
+A_st_macro 4, 0, 0
 // Strore C
 GST xv, , U0, C0, 0x00
 .endm
@@ -460,10 +458,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_dsolve_2x1:
 PTR_ADDI A0, T1, -2 * 8
 PTR_ADDI B0, T2, -1 * 8
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 0, 0, 2
+A_st_macro 2, 0, 0
 // Strore C
 GST v, , $vr0, C0, 0x00
 .endm
@@ -480,10 +478,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_dsolve_1x1:
 PTR_ADDI A0, T1, -1 * 8
 PTR_ADDI B0, T2, -1 * 8
-ldrepl_macro 16, 16, 0
+ldrepl_macro 16, 16
 GMUL xvf, d, U0, D0, U0
 // Store A
-A_st_macro 0, 0, 0, 1
+A_st_macro 1, 0, 0
 // Strore C
 GST f, d, $f0, C0, 0x00
 .endm
@@ -697,7 +695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 PROLOGUE
-push_if_used 26, 32
+push_if_used 9, 8
 PTR_SLLI LDC, LDC, 3
 PTR_SUB KK, N, OFFSET
 PTR_MUL T0, N, LDC
@@ -948,6 +946,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PTR_ADDI KK, KK, -4
 bnez J, .L_J1
 .L_N0:
-pop_if_used 26, 32
+pop_if_used 9, 8
 jirl $r0, $r1, 0x0
 EPILOGUE

View File

@@ -90,57 +90,175 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define PTR_FST fst.d
 #endif
-// The max registers available to the user which
-// do not need to be preserved across calls.
-// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
-#define MAX_INT_CALLER_SAVED 17
-#define MAX_FP_CALLER_SAVED 24
 .altmacro // Enable alternate macro mode
+/*
+ * Pushing and popping static registers into/from the stack.
+ * regs : number of static general-purpose registers, greater than or equal to 0, less than or equal to 9
+ * fregs: number of static floating-point registers, greater than or equal to 0, less than or equal to 8
+ */
 .macro push_if_used regs, fregs
-.if \regs > MAX_INT_CALLER_SAVED
-PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
-push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
+.if \regs > 0
+PTR_ADDI $sp, $sp, -(\regs << REG_LOG)
+push_regs 0, \regs - 1
 .endif
-.if \fregs > MAX_FP_CALLER_SAVED
-PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
-push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
+.if \fregs > 0
+PTR_ADDI $sp, $sp, -(\fregs << FREG_LOG)
+push_fregs 0, \fregs - 1
 .endif
 .endm // End push_if_used
 .macro pop_if_used regs, fregs
-.if \fregs > MAX_FP_CALLER_SAVED
-pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
-PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
+.if \fregs > 0
+pop_fregs 0, \fregs - 1
+PTR_ADDI $sp, $sp, \fregs << FREG_LOG
 .endif
-.if \regs > MAX_INT_CALLER_SAVED
-pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
-PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
+.if \regs > 0
+pop_regs 0, \regs - 1
+PTR_ADDI $sp, $sp, \regs << REG_LOG
 .endif
 .endm // End pop_if_used
 .macro push_regs from, to
+#ifdef __clang__
+.if \to >= 0
+PTR_ST $s0, $sp, 0 << REG_LOG
+.endif
+.if \to >= 1
+PTR_ST $s1, $sp, 1 << REG_LOG
+.endif
+.if \to >= 2
+PTR_ST $s2, $sp, 2 << REG_LOG
+.endif
+.if \to >= 3
+PTR_ST $s3, $sp, 3 << REG_LOG
+.endif
+.if \to >= 4
+PTR_ST $s4, $sp, 4 << REG_LOG
+.endif
+.if \to >= 5
+PTR_ST $s5, $sp, 5 << REG_LOG
+.endif
+.if \to >= 6
+PTR_ST $s6, $sp, 6 << REG_LOG
+.endif
+.if \to >= 7
+PTR_ST $s7, $sp, 7 << REG_LOG
+.endif
+.if \to >= 8
+PTR_ST $s8, $sp, 8 << REG_LOG
+.endif
+#else
 PTR_ST $s\()\from, $sp, \from << REG_LOG
 .if \to - \from
 push_regs %from + 1, \to
 .endif
+#endif
 .endm // End push_regs
 .macro pop_regs from, to
+#ifdef __clang__
+.if \to >= 0
+PTR_LD $s0, $sp, 0 << REG_LOG
+.endif
+.if \to >= 1
+PTR_LD $s1, $sp, 1 << REG_LOG
+.endif
+.if \to >= 2
+PTR_LD $s2, $sp, 2 << REG_LOG
+.endif
+.if \to >= 3
+PTR_LD $s3, $sp, 3 << REG_LOG
+.endif
+.if \to >= 4
+PTR_LD $s4, $sp, 4 << REG_LOG
+.endif
+.if \to >= 5
+PTR_LD $s5, $sp, 5 << REG_LOG
+.endif
+.if \to >= 6
+PTR_LD $s6, $sp, 6 << REG_LOG
+.endif
+.if \to >= 7
+PTR_LD $s7, $sp, 7 << REG_LOG
+.endif
+.if \to >= 8
+PTR_LD $s8, $sp, 8 << REG_LOG
+.endif
+#else
 PTR_LD $s\()\from, $sp, \from << REG_LOG
 .if \to - \from
 pop_regs %from + 1, \to
 .endif
+#endif
 .endm // End pop_regs
 .macro push_fregs from, to
+#ifdef __clang__
+.if \to >= 0
+PTR_FST $fs0, $sp, 0 << FREG_LOG
+.endif
+.if \to >= 1
+PTR_FST $fs1, $sp, 1 << FREG_LOG
+.endif
+.if \to >= 2
+PTR_FST $fs2, $sp, 2 << FREG_LOG
+.endif
+.if \to >= 3
+PTR_FST $fs3, $sp, 3 << FREG_LOG
+.endif
+.if \to >= 4
+PTR_FST $fs4, $sp, 4 << FREG_LOG
+.endif
+.if \to >= 5
+PTR_FST $fs5, $sp, 5 << FREG_LOG
+.endif
+.if \to >= 6
+PTR_FST $fs6, $sp, 6 << FREG_LOG
+.endif
+.if \to >= 7
+PTR_FST $fs7, $sp, 7 << FREG_LOG
+.endif
+#else
 PTR_FST $fs\()\from, $sp, \from << FREG_LOG
 .if \to - \from
 push_fregs %from + 1, \to
 .endif
+#endif
 .endm // End push_fregs
 .macro pop_fregs from, to
+#ifdef __clang__
+.if \to >= 0
+PTR_FLD $fs0, $sp, 0 << FREG_LOG
+.endif
+.if \to >= 1
+PTR_FLD $fs1, $sp, 1 << FREG_LOG
+.endif
+.if \to >= 2
+PTR_FLD $fs2, $sp, 2 << FREG_LOG
+.endif
+.if \to >= 3
+PTR_FLD $fs3, $sp, 3 << FREG_LOG
+.endif
+.if \to >= 4
+PTR_FLD $fs4, $sp, 4 << FREG_LOG
+.endif
+.if \to >= 5
+PTR_FLD $fs5, $sp, 5 << FREG_LOG
+.endif
+.if \to >= 6
+PTR_FLD $fs6, $sp, 6 << FREG_LOG
+.endif
+.if \to >= 7
+PTR_FLD $fs7, $sp, 7 << FREG_LOG
+.endif
+#else
 PTR_FLD $fs\()\from, $sp, \from << FREG_LOG
 .if \to - \from
 pop_fregs %from + 1, \to
 .endif
+#endif
 .endm // End pop_fregs
 //
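Two things change in this header. First, push_if_used/pop_if_used now take the callee-saved counts ($s0–$s8, $fs0–$fs7) directly, dropping the MAX_*_CALLER_SAVED arithmetic. Second, the save/restore loops gain unrolled #ifdef __clang__ bodies, because clang's integrated assembler does not implement .altmacro, on which the recursive GNU-as path relies for %from + 1 evaluation and the $s\()\from name pasting. Either path emits the same instructions; e.g. push_if_used 2, 1 comes out roughly as follows (a sketch assuming LP64, where PTR_ADDI/PTR_ST/PTR_FST are addi.d/st.d/fst.d and REG_LOG == FREG_LOG == 3):

    addi.d $sp, $sp, -(2 << 3)   // reserve 2 GPR slots
    st.d   $s0, $sp, 0 << 3
    st.d   $s1, $sp, 1 << 3
    addi.d $sp, $sp, -(1 << 3)   // reserve 1 FPR slot
    fst.d  $fs0, $sp, 0 << 3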
@@ -275,7 +393,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // GXOR
 //
 .macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
-\pre_op\()xor.\suf_op \out, \in0, \in1
+.ifnb \pre_op
+\pre_op\()xor.v \out, \in0, \in1
+.else
+xor.\suf_op \out, \in0, \in1
+.endif
 .ifnb \more
 GXOR \pre_op, \suf_op, \more
 .endif
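The GXOR change accounts for the fact that LSX/LASX provide only the whole-register bitwise forms vxor.v/xvxor.v: when pre_op is non-blank, the macro now hardwires the .v suffix instead of pasting suf_op. An illustrative expansion (register choices are arbitrary):

    GXOR xv, v, $xr0, $xr1, $xr2   // vector path: emits xvxor.v $xr0, $xr1, $xr2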
@@ -307,6 +429,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 GPRELD \more
 .endif
 .endm
+//
+// GPACKEV
+//
+.macro GPACKEV pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+\pre_op\()packev.\suf_op \out, \in0, \in1
+.ifnb \more
+GPACKEV \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GPACKOD
+//
+.macro GPACKOD pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+\pre_op\()packod.\suf_op \out, \in0, \in1
+.ifnb \more
+GPACKOD \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GSHUF4I
+//
+.macro GSHUF4I pre_op:req, suf_op:req, out:req, in0:req, in1:req /* imm */, more:vararg
+\pre_op\()shuf4i.\suf_op \out, \in0, \in1
+.ifnb \more
+GSHUF4I \pre_op, \suf_op, \more
+.endif
+.endm
+.macro TRANSF2G name, pre_op:req, suf_op:req, more:vararg
+.ifeqs "\pre_op\()\suf_op", "vfs"
+\name v, w, \more
+.endif
+.ifeqs "\pre_op\()\suf_op", "vfd"
+\name v, d, \more
+.endif
+.ifeqs "\pre_op\()\suf_op", "xvfs"
+\name xv, w, \more
+.endif
+.ifeqs "\pre_op\()\suf_op", "xvfd"
+\name xv, d, \more
+.endif
+.endm
 //
 // Compound instructions
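TRANSF2G is a new dispatch helper: it maps a floating-point pre_op/suf_op pair onto the vector prefix and integer element width that the pack/shuffle helpers expect (s maps to w, d stays d), so float kernels can reuse the integer lane-manipulation macros. An illustrative call (registers are arbitrary):

    TRANSF2G GPACKEV, xvf, s, $xr0, $xr1, $xr2
    // matches the "xvfs" case, expanding to:
    //   GPACKEV xv, w, $xr0, $xr1, $xr2  ->  xvpackev.w $xr0, $xr1, $xr2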
@ -314,61 +478,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// GACC: Accumulate the values of vector registers
//
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf" .ifeqs "\pre_op\()\suf_op", "xvfd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvfs"
xvpermi.q \out, \in, 0x01 xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in \pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
xvpackod.w \in, \out, \out xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.endif .endif
.endif .ifeqs "\pre_op\()\suf_op", "vfd"
vpackod.d \out, \in, \in
.ifeqs "\pre_op", "vf" \pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vfs"
vpackod.d \out, \in, \in vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
vpackod.w \in, \out, \out vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.endif .endif
.endif
.ifeqs "\pre_op", "xv" .ifeqs "\pre_op\()\suf_op", "xvd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvw"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvh"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvb"
xvpermi.q \out, \in, 0x01 xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in \pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
xvpackod.w \in, \out, \out xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
xvpackod.h \in, \out, \out xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
xvpackod.b \in, \out, \out xvpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.endif .endif
.endif
.endif
.endif
.ifeqs "\pre_op", "v" .ifeqs "\pre_op\()\suf_op", "vd"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vw"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vh"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vb"
vpackod.d \out, \in, \in vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
vpackod.w \in, \out, \out vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
vpackod.h \in, \out, \out vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
vpackod.b \in, \out, \out vpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.endif .endif
.endif
.endif
.endif
.ifnb \more
GACC \pre_op, \suf_op, \more
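//
// Sketch: GACC xvf, s, $xr0, $xr1 (the "xvfs" arm) reduces the eight
// floats in $xr1 into element 0 of $xr0 by repeated folding: xvpermi.q
// brings the high 128 bits down, then xvpackod.d and xvpackod.w pair the
// remaining elements for the final xvfadd.s steps.
//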
@ -391,26 +590,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Note: When "pre_op = xvf && suf_op = s", in will be modified. // Note: When "pre_op = xvf && suf_op = s", in will be modified.
// //
.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg .macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf" .ifeqs "\pre_op\()\suf_op", "xvfd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvfs"
xvpermi.q \out, \in, 0x01 xvpermi.q \out, \in, 0x01
.ifeqs "\suf_op", "s"
\pre_op\()add.\suf_op \in, \out, \in \pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in \pre_op\()add.\suf_op \out, \out, \in
.else
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif .endif
.ifeqs "\pre_op", "vf" .ifeqs "\pre_op\()\suf_op", "vfd"
.ifeqs "\suf_op", "s"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.else
vor.v \out, \in, \in vor.v \out, \in, \in
.endif .endif
.endif
.ifeqs "\pre_op\()\suf_op", "vfs"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifnb \more .ifnb \more
GCOMPLEXACC \pre_op, \suf_op, \more GCOMPLEXACC \pre_op, \suf_op, \more
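//
// Sketch: GCOMPLEXACC folds the same way but stops one level earlier so
// each real/imaginary pair stays together; in the "vfd" arm a 128-bit
// register already holds exactly one double-precision complex value, so
// a plain register copy (vor.v) is all that is needed.
//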
@ -430,56 +629,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// suf_op: s or d, differentiate between single precision or double precision complex numbers
//
.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1
TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
.ifeqs "\xconj", "0"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
.else
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
.endif
.ifeqs "\suf_op", "s"
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.else
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
\pre_op\()mul.\suf_op \out, \tmp0, \in1
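//
// Sketch of the arithmetic for (a+bi)*(c+di): GPACKEV duplicates the
// real parts (tmp0 = {a, a}), the sub/GPACKOD pair builds {-b, b} (or
// its \xconj variant), and GSHUF4I swaps in1 into {d, c}; the final
// mul/madd pair then yields {ac - bd, ad + bc}.
//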
@ -512,112 +676,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// suf_op: s or d, differentiate between single precision or double precision complex numbers
//
.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1
TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0
\pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2
.ifeqs "\conj\()\suf_op", "1s"
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0xb1
TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2
.endif
.ifeqs "\conj\()\suf_op", "1d"
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0x0b
TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2
.endif
.ifeqs "\conj", "0"
\pre_op\()add.\suf_op \out, \tmp2, \tmp1
.endif
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
.ifeqs "\xconj\()\conj\()\suf_op", "00s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "10s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "01s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "11s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "00d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "10d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "01d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "11d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
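//
// Sketch: with xconj=0, conj=0, suf_op=d on LSX the "00d" arm above is
// expected to expand to
//     vpackod.d \tmp1, \in0, \tmp1
//     vshuf4i.d \tmp2, \in1, 0x0b
// The eight arms simply enumerate every xconj/conj/precision combination
// while keeping all emitted mnemonics width-typed.
//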

View File

@ -837,7 +837,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
PROLOGUE
push_if_used 9, 8
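// Note (assumption): push_if_used/pop_if_used now take counts of
// callee-saved GPRs ($s0...) and FPRs ($fs0...) to spill, rather than
// the absolute register indices used before.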
xvreplve0.w VALPHA, $xr0
#if defined (TRMMKERNEL) && !defined(LEFT)
PTR_SUB OFF, ZERO, OFFSET
@ -2343,6 +2343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif // #if defined(TRMMKERNEL)
.L_N1_M0:
.L_N0:
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_N0
PROLOGUE
push_if_used 9, 8
move TD, DST
move TS, SRC
@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI M, M, -1
blt ZERO, M, .L_N1_M1
.L_N0:
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_N0
PROLOGUE
push_if_used 0, 0
move TD, DST
move TS, SRC
@ -293,6 +293,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI M, M, -1
blt ZERO, M, .L_N1_M1
.L_N0:
pop_if_used 0, 0
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_M0
PROLOGUE
push_if_used 7, 0
move S0, SRC
move P0, DST
@ -521,6 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI S1, S1, 0x04
PTR_ADDI P5, P5, 0x04
.L_M0:
pop_if_used 7, 0
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_M0
PROLOGUE
push_if_used 6, 0
move S0, SRC
move P0, DST
@ -401,6 +401,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI S1, S1, 0x04
PTR_ADDI P4, P4, 0x04
.L_M0:
pop_if_used 6, 0
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -418,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 7, 0
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_END:
pop_if_used 7, 0
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 8, 0
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -400,6 +400,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
.L_END:
pop_if_used 8, 0
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -253,7 +253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -291,6 +291,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
.L_END:
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -337,7 +337,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -263,6 +263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
ZGEMV_T_LSX GAP_1, X2_GAP
.L_END:
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -294,6 +294,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
ZGEMV_T_LASX GAP_1, X4_GAP
.L_END:
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE