loongarch64: Fixed clang compilation issues

gxw 2024-04-15 10:31:33 +08:00
parent 15b9fc3f78
commit 7cd438a5ac
27 changed files with 645 additions and 514 deletions

View File

@ -955,12 +955,18 @@ endif
ifeq ($(ARCH), loongarch64)
LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
LA64_ARCH=$(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64)
ifneq ($(LA64_ABI), lp64d)
LA64_ABI=lp64
endif
ifneq ($(LA64_ARCH), loongarch64)
CCOMMON_OPT += -mabi=$(LA64_ABI)
FCOMMON_OPT += -mabi=$(LA64_ABI)
else
CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
endif
endif
endif

c_check
View File

@ -197,10 +197,22 @@ fi
no_lsx=0
no_lasx=0
if [ "$architecture" = "loongarch64" ]; then
lasx_flags='-march=loongarch64'
lsx_flags='-march=loongarch64'
tmpd="$(mktemp -d)"
tmparch="$tmpd/arch.c"
printf "void main(void){ }\n" >> "$tmparch"
args="-march=loongarch64 -o $tmparch.o $tmparch"
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
lasx_flags=''
lsx_flags=''
}
tmplsx="$tmpd/lsx.c"
codelsx='"vadd.b $vr0, $vr0, $vr0"'
lsx_flags='-march=loongarch64'
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
args="$lsx_flags -o $tmplsx.o $tmplsx"
{
@ -211,7 +223,6 @@ if [ "$architecture" = "loongarch64" ]; then
tmplasx="$tmpd/lasx.c"
codelasx='"xvadd.b $xr0, $xr0, $xr0"'
lasx_flags='-march=loongarch64'
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
args="$lasx_flags -o $tmplasx.o $tmplasx"
{

View File

@ -279,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -318,6 +318,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -336,7 +336,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -378,6 +378,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -285,6 +285,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
CGEMV_T_LSX GAP_1, X4_GAP
.L_END:
pop_if_used 17 + 8, 30
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -304,7 +304,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -337,6 +337,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
CGEMV_T_LASX GAP_1, X8_GAP
.L_END:
pop_if_used 17 + 8, 30
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define D7 $vr15
PROLOGUE
push_if_used 26, 32
push_if_used 0, 0
move TD, DST
move TS, SRC
slli.d TL, LDA, 0x03
@ -278,6 +278,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d M, M, -1
blt ZERO, M, .L_M1
.L_N0:
pop_if_used 26, 32
pop_if_used 0, 0
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define U7 $vr7
PROLOGUE
push_if_used 18, 8
push_if_used 1, 0
move S0, SRC
move P0, DST
@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fst.d F0, P3, 0x00
.L_M0:
pop_if_used 18, 8
pop_if_used 1, 0
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define U7 $vr7
PROLOGUE
push_if_used 24, 8
push_if_used 7, 0
move S0, SRC
move P0, DST
@ -592,6 +592,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d S1, S1, 0x08
addi.d P4, P4, 0x08
.L_M0:
pop_if_used 24, 8
pop_if_used 7, 0
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -509,7 +509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 24 + 4
push_if_used 7, 4
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -549,6 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 24 + 4
pop_if_used 7, 4
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -445,7 +445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 24 + 3
push_if_used 8, 3
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -476,6 +476,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
.L_END:
pop_if_used 17 + 8, 24 + 3
pop_if_used 8, 3
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -1029,7 +1029,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
/* if (!(N >> 2)) goto L_N3 */
PTR_SRAI J, N, 2 /* J = bn >> 2 */
@ -1361,6 +1361,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
blt ZERO, I, .L_N1_I1
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -128,31 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dtrsm_kernel_macro.S"
.macro ldrepl_macro start, end, stride
.macro ldrepl_macro stride:req, index:req, more:vararg
// Load Ux (x = 0...15)
.if \start <= \end
GLDREPL xv, d, $xr\start, A0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
GLDREPL xv, d, $xr\index, A0, \index * 8 - \stride * 8
.ifnb \more
ldrepl_macro \stride, \more
.endif
.endm
.macro nmsub_macro start0, end0, start1, reg
.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
// Gx -= reg * Ux
.if \start0 <= \end0
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.ifnb \more
nmsub_macro \reg, \more
.endif
.endm
.macro B_st_macro start, end, stride, N
.macro B_st_macro N:req, stride:req, start:req, more:vararg
// Store Gx(x = 16...31)
.if \start <= \end
.if \N == 4
xvst $xr\start, B0, \stride * 0x20
xvst $xr\start, B0, \start * 0x20 - \stride * 0x20
.elseif \N == 2
vst $vr\start, B0, \stride * 0x10
vst $vr\start, B0, \start * 0x10 - \stride * 0x10
.elseif \N == 1
fst.d $f\start, B0, \stride * 0x08
fst.d $f\start, B0, \start * 0x08 - \stride * 0x08
.endif
B_st_macro %start + 1, \end, %stride + 1, \N
.ifnb \more
B_st_macro \N, \stride, \more
.endif
.endm
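This macro rewrite is the heart of the clang fix in the kernel files: the old helpers recursed on a numeric counter and relied on .altmacro so that arguments like %start + 1 were evaluated as expressions, a form GNU as accepts but clang's integrated assembler does not handle the way these macros require. The new helpers instead take an explicit :vararg list and recurse by peeling one argument per step, which both assemblers accept. A minimal standalone sketch of the two styles (hypothetical emit_old/emit_new macros, not part of this commit; .byte stands in for the real GLDREPL/xvst work):

.altmacro                               /* old style: needs %expr evaluation */
.macro emit_old start, end
    .byte \start
    .if \end - \start
    emit_old %start + 1, \end           /* the form clang rejected */
    .endif
.endm
.noaltmacro

.macro emit_new index:req, more:vararg  /* new style: peel an explicit list */
    .byte \index
    .ifnb \more
    emit_new \more
    .endif
.endm

emit_old 1, 3                           /* emits bytes 1, 2, 3 (GNU as only) */
emit_new 1, 2, 3                        /* emits bytes 1, 2, 3 (GNU as and clang) */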
@ -194,86 +194,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 255
// Sequentially extract data from A in row order
// Load 0
ldrepl_macro 0, 15, 0
ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G0, G0, U0
nmsub_macro 17, 31, 1, G0
nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \
25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 1
ldrepl_macro 1, 15, 0
ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G1, G1, U1
nmsub_macro 18, 31, 2, G1
nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \
25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 2
ldrepl_macro 2, 15, 0
ldrepl_macro 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G2, G2, U2
nmsub_macro 19, 31, 3, G2
nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, \
10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 3
ldrepl_macro 3, 15, 0
ldrepl_macro 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G3, G3, U3
nmsub_macro 20, 31, 4, G3
nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, \
27, 11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 4
ldrepl_macro 4, 15, 0
ldrepl_macro 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G4, G4, U4
nmsub_macro 21, 31, 5, G4
nmsub_macro G4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, \
28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 5
ldrepl_macro 5, 15, 0
ldrepl_macro 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G5, G5, U5
nmsub_macro 22, 31, 6, G5
nmsub_macro G5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, \
29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 6
ldrepl_macro 6, 15, 0
ldrepl_macro 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G6, G6, U6
nmsub_macro 23, 31, 7, G6
nmsub_macro G6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, \
30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 7
ldrepl_macro 7, 15, 0
ldrepl_macro 7, 7, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G7, G7, U7
nmsub_macro 24, 31, 8, G7
nmsub_macro G7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 8
ldrepl_macro 8, 15, 0
ldrepl_macro 8, 8, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G8, G8, U8
nmsub_macro 25, 31, 9, G8
nmsub_macro G8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 9
ldrepl_macro 9, 15, 0
ldrepl_macro 9, 9, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G9, G9, U9
nmsub_macro 26, 31, 10, G9
nmsub_macro G9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 10
ldrepl_macro 10, 15, 0
ldrepl_macro 10, 10, 11, 12, 13, 14, 15
GMUL xvf, d, G10, G10, U10
nmsub_macro 27, 31, 11, G10
nmsub_macro G10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 11
ldrepl_macro 11, 15, 0
ldrepl_macro 11, 11, 12, 13, 14, 15
GMUL xvf, d, G11, G11, U11
nmsub_macro 28, 31, 12, G11
nmsub_macro G11, 28, 12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 12
ldrepl_macro 12, 15, 0
ldrepl_macro 12, 12, 13, 14, 15
GMUL xvf, d, G12, G12, U12
nmsub_macro 29, 31, 13, G12
nmsub_macro G12, 29, 13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 13
ldrepl_macro 13, 15, 0
ldrepl_macro 13, 13, 14, 15
GMUL xvf, d, G13, G13, U13
nmsub_macro 30, 31, 14, G13
nmsub_macro G13, 30, 14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 14
ldrepl_macro 14, 15, 0
ldrepl_macro 14, 14, 15
GMUL xvf, d, G14, G14, U14
nmsub_macro 31, 31, 15, G14
nmsub_macro G14, 31, 15
PTR_ADDI A0, A0, 17 * 8
// Load 15
ldrepl_macro 15, 15, 0
ldrepl_macro 15, 15
GMUL xvf, d, G15, G15, U15
// Finally, we can store the result.
// B is stored sequentially; C is first transposed and then stored
B_st_macro 16, 31, 0, \N
B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1
@ -334,46 +341,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 63
// Sequentially extract data from A in row order
// Load 0
ldrepl_macro 0, 7, 0
ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7
GMUL xvf, d, G0, G0, U0
nmsub_macro 17, 23, 1, G0
nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
PTR_ADDI A0, A0, 9 * 8
// Load 1
ldrepl_macro 1, 7, 0
ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7
GMUL xvf, d, G1, G1, U1
nmsub_macro 18, 23, 2, G1
nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
PTR_ADDI A0, A0, 9 * 8
// Load 2
ldrepl_macro 2, 7, 0
ldrepl_macro 2, 2, 3, 4, 5, 6, 7
GMUL xvf, d, G2, G2, U2
nmsub_macro 19, 23, 3, G2
nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
PTR_ADDI A0, A0, 9 * 8
// Load 3
ldrepl_macro 3, 7, 0
ldrepl_macro 3, 3, 4, 5, 6, 7
GMUL xvf, d, G3, G3, U3
nmsub_macro 20, 23, 4, G3
nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7
PTR_ADDI A0, A0, 9 * 8
// Load 4
ldrepl_macro 4, 7, 0
ldrepl_macro 4, 4, 5, 6, 7
GMUL xvf, d, G4, G4, U4
nmsub_macro 21, 23, 5, G4
nmsub_macro G4, 21, 5, 22, 6, 23, 7
PTR_ADDI A0, A0, 9 * 8
// Load 5
ldrepl_macro 5, 7, 0
ldrepl_macro 5, 5, 6, 7
GMUL xvf, d, G5, G5, U5
nmsub_macro 22, 23, 6, G5
nmsub_macro G5, 22, 6, 23, 7
PTR_ADDI A0, A0, 9 * 8
// Load 6
ldrepl_macro 6, 7, 0
ldrepl_macro 6, 6, 7
GMUL xvf, d, G6, G6, U6
nmsub_macro 23, 23, 7, G6
nmsub_macro G6, 23, 7
PTR_ADDI A0, A0, 9 * 8
// Load 7
ldrepl_macro 7, 7, 0
ldrepl_macro 7, 7
GMUL xvf, d, G7, G7, U7
// Finally, we can store the result.
// B is stored sequentially; C is first transposed and then stored
B_st_macro 16, 23, 0, \N
B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
.if \N == 4
@ -437,26 +444,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 15
// Sequentially extract data from A in row order
// Load 0
ldrepl_macro 0, 3, 0
ldrepl_macro 0, 0, 1, 2, 3
GMUL xvf, d, G0, G0, U0
nmsub_macro 17, 19, 1, G0
nmsub_macro G0, 17, 1, 18, 2, 19, 3
PTR_ADDI A0, A0, 5 * 8
// Load 1
ldrepl_macro 1, 3, 0
ldrepl_macro 1, 1, 2, 3
GMUL xvf, d, G1, G1, U1
nmsub_macro 18, 19, 2, G1
nmsub_macro G1, 18, 2, 19, 3
PTR_ADDI A0, A0, 5 * 8
// Load 2
ldrepl_macro 2, 3, 0
ldrepl_macro 2, 2, 3
GMUL xvf, d, G2, G2, U2
nmsub_macro 19, 19, 3, G2
nmsub_macro G2, 19, 3
PTR_ADDI A0, A0, 5 * 8
// Load 3
ldrepl_macro 3, 3, 0
ldrepl_macro 3, 3
GMUL xvf, d, G3, G3, U3
// Finally, we can store the result.
// B is stored sequentially; C is first transposed and then stored
B_st_macro 16, 19, 0, \N
B_st_macro \N, 16, 16, 17, 18, 19
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
.if \N == 4
GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00
@ -501,16 +508,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 3
// Sequentially extract data from A in row order
// Load 0
ldrepl_macro 0, 1, 0
ldrepl_macro 0, 0, 1
GMUL xvf, d, G0, G0, U0
nmsub_macro 17, 17, 1, G0
nmsub_macro G0, 17, 1
PTR_ADDI A0, A0, 3 * 8
// Load 1
ldrepl_macro 1, 1, 0
ldrepl_macro 1, 1
GMUL xvf, d, G1, G1, U1
// Finally, we can store the result.
// B is stored sequentially; C is first transposed and then stored
B_st_macro 16, 17, 0, \N
B_st_macro \N, 16, 16, 17
GSBUTTERFLY xv, d, U0, U1, G1, G0
.if \N == 4
vst $vr0, C0, 0x00
@ -717,7 +724,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
/* if (!(N >> 2)) goto L_N3 */
PTR_SRAI J, N, 2 /* J = bn >> 2 */
@ -954,6 +961,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -128,33 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dtrsm_kernel_macro.S"
.macro ldrepl_macro start, end, stride
.macro ldrepl_macro stride:req, index:req, more:vararg
// Load Ux (x = 0...15)
.if \start <= \end
GLDREPL xv, d, $xr\start, B0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
.ifnb \more
ldrepl_macro \stride, \more
.endif
.endm
.macro nmsub_macro start0, end0, start1, reg
// Ux -= reg * Dx
.if \start0 <= \end0
.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
// Gx -= reg * Ux
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.ifnb \more
nmsub_macro \reg, \more
.endif
.endm
.macro A_st_macro start, end, stride, N
// Store Ux(x = 0...15)
.if \start <= \end
.macro A_st_macro N:req, stride:req, start:req, more:vararg
// Store Gx(x = 16...31)
.if \N == 4
xvst $xr\start, A0, \stride * 0x20
xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
.elseif \N == 2
vst $vr\start, A0, \stride * 0x10
vst $vr\start, A0, \start * 0x10 - \stride * 0x10
.elseif \N == 1
fst.d $f\start, A0, \stride * 0x08
fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
.endif
A_st_macro %start + 1, \end, %stride + 1, \N
.ifnb \more
A_st_macro \N, \stride, \more
.endif
.endm
@ -167,22 +165,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
ldrepl_macro 20, 22, 5
nmsub_macro 4, 7, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
ldrepl_macro 13, 23, 24
GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7
ldrepl_macro 25, 25, 15
nmsub_macro 8, 11, 0, D2
nmsub_macro 8, 11, 4, D5
ldrepl_macro 10, 25
nmsub_macro D2, 8, 0, 9, 1, 10, 2, 11, 3
nmsub_macro D5, 8, 4, 9, 5, 10, 6, 11, 7
GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11
nmsub_macro 12, 15, 0, D3
nmsub_macro 12, 15, 4, D6
nmsub_macro 12, 15, 8, D8
nmsub_macro D3, 12, 0, 13, 1, 14, 2, 15, 3
nmsub_macro D6, 12, 4, 13, 5, 14, 6, 15, 7
nmsub_macro D8, 12, 8, 13, 9, 14, 10, 15, 11
GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
// Store A
A_st_macro 0, 15, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \
@ -197,13 +196,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
ldrepl_macro 18, 18, 3
nmsub_macro 4, 7, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
// Store A
A_st_macro 0, 7, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
@ -218,22 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0, U1, D0, U1
ldrepl_macro 20, 22, 5
nmsub_macro 2, 3, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 2, 0, 3, 1
ldrepl_macro 13, 23, 24
GMUL xvf, d, U2, D4, U2, U3, D4, U3
ldrepl_macro 25, 25, 15
nmsub_macro 4, 5, 0, D2
nmsub_macro 4, 5, 2, D5
ldrepl_macro 10, 25
nmsub_macro D2, 4, 0, 5, 1
nmsub_macro D5, 4, 2, 5, 3
GMUL xvf, d, U4, D7, U4, U5, D7, U5
nmsub_macro 6, 7, 0, D3
nmsub_macro 6, 7, 2, D6
nmsub_macro 6, 7, 4, D8
nmsub_macro D3, 6, 0, 7, 1
nmsub_macro D6, 6, 2, 7, 3
nmsub_macro D8, 6, 4, 7, 5
GMUL xvf, d, U6, D9, U6, U7, D9, U7
// Store A
A_st_macro 0, 7, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20, \
@ -248,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0, U1, D0, U1
ldrepl_macro 18, 18, 3
nmsub_macro 2, 3, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 2, 0, 3, 1
GMUL xvf, d, U2, D2, U2, U3, D2, U3
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20
@ -269,22 +268,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 1, 0
ldrepl_macro 13, 23, 24
GMUL xvf, d, U1, D4, U1
ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
ldrepl_macro 10, 25
nmsub_macro D2, 2, 0
nmsub_macro D5, 2, 1
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
nmsub_macro D3, 3, 0
nmsub_macro D6, 3, 1
nmsub_macro D8, 3, 2
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
.endm
@ -296,13 +295,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 1, 0
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 4
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00
.endm
@ -316,23 +315,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 1, 0
ldrepl_macro 13, 23, 24
GMUL xvf, d, U1, D4, U1
ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
ldrepl_macro 10, 25
nmsub_macro D2, 2, 0
nmsub_macro D5, 2, 1
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
nmsub_macro D3, 3, 0
nmsub_macro D6, 3, 1
nmsub_macro D8, 3, 2
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 2
A_st_macro 2, 0, 0, 1, 2, 3
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00,
.endm
@ -344,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 1, 0
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 2
A_st_macro 2, 0, 0, 1
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
.endm
@ -364,23 +363,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 1, 0
ldrepl_macro 13, 23, 24
GMUL xvf, d, U1, D4, U1
ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
ldrepl_macro 10, 25
nmsub_macro D2, 2, 0
nmsub_macro D5, 2, 1
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
nmsub_macro D3, 3, 0
nmsub_macro D6, 3, 1
nmsub_macro D8, 3, 2
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 1
A_st_macro 1, 0, 0, 1, 2, 3
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
.endm
@ -392,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 1, 0
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 1
A_st_macro 1, 0, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
.endm
@ -582,10 +581,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvld U2, C0, 0x40
xvld U3, C0, 0x60
.L_dsolve_16x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
.endm
@ -599,10 +598,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvld U0, C0, 0x00
xvld U1, C0, 0x20
.L_dsolve_8x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 1, 0, 4
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20
.endm
@ -615,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_4x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 4
A_st_macro 4, 0, 0
// Store C
GST xv, , U0, C0, 0x00
.endm
@ -631,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_2x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 2
A_st_macro 2, 0, 0
// Store C
GST v, , $vr0, C0, 0x00
.endm
@ -647,16 +646,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Load C
fld.d $f0, C0, 0x00
.L_dsolve_1x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 1
A_st_macro 1, 0, 0
// Store C
GST f, d, $f0, C0, 0x00
.endm
PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
PTR_SUB KK, ZERO, OFFSET
/* if (!(N >> 2)) goto L_N3 */
@ -877,6 +876,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -111,33 +111,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dtrsm_kernel_macro.S"
.macro ldrepl_macro start, end, stride
.macro ldrepl_macro stride:req, index:req, more:vararg
// Load Ux (x = 0...15)
.if \start <= \end
GLDREPL xv, d, $xr\start, B0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
.ifnb \more
ldrepl_macro \stride, \more
.endif
.endm
.macro nmsub_macro start0, end0, start1, reg
// Ux -= reg * Dx
.if \start0 <= \end0
.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
// Gx -= reg * Ux
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.ifnb \more
nmsub_macro \reg, \more
.endif
.endm
.macro A_st_macro start, end, stride, N
// Store Ux(x = 0...15)
.if \start <= \end
.macro A_st_macro N:req, stride:req, start:req, more:vararg
// Store Gx(x = 16...31)
.if \N == 4
xvst $xr\start, A0, \stride * 0x20
xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
.elseif \N == 2
vst $vr\start, A0, \stride * 0x10
vst $vr\start, A0, \start * 0x10 - \stride * 0x10
.elseif \N == 1
fst.d $f\start, A0, \stride * 0x08
fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
.endif
A_st_macro %start + 1, \end, %stride + 1, \N
.ifnb \more
A_st_macro \N, \stride, \more
.endif
.endm
@ -148,13 +146,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
nmsub_macro 0, 3, 4, D1
nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 7, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
@ -167,13 +165,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U2, D2, U2, U3, D2, U3
nmsub_macro 0, 1, 2, D1
nmsub_macro D1, 0, 2, 1, 3
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20
@ -186,13 +184,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 4
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00
.endm
@ -204,13 +202,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 2
A_st_macro 2, 0, 0, 1
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
.endm
@ -222,13 +220,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 1
A_st_macro 1, 0, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
.endm
@ -242,22 +240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
ldrepl_macro 19, 21, 8
nmsub_macro 8, 11, 12, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 8, 12, 9, 13, 10, 14, 11, 15
ldrepl_macro 13, 17, 18
GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11
ldrepl_macro 16, 16, 0
nmsub_macro 4, 7, 12, D7
nmsub_macro 4, 7, 8, D4
ldrepl_macro 16, 16
nmsub_macro D7, 4, 12, 5, 13, 6, 14, 7, 15
nmsub_macro D4, 4, 8, 5, 9, 6, 10, 7, 11
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
nmsub_macro 0, 3, 12, D6
nmsub_macro 0, 3, 8, D3
nmsub_macro 0, 3, 4, D1
nmsub_macro D6, 0, 12, 1, 13, 2, 14, 3, 15
nmsub_macro D3, 0, 8, 1, 9, 2, 10, 3, 11
nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 15, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \
@ -274,22 +272,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U6, D9, U6, U7, D9, U7
ldrepl_macro 19, 21, 8
nmsub_macro 4, 5, 6, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 4, 6, 5, 7
ldrepl_macro 13, 17, 18
GMUL xvf, d, U4, D5, U4, U5, D5, U5
ldrepl_macro 16, 16, 0
nmsub_macro 2, 3, 6, D7
nmsub_macro 2, 3, 4, D4
ldrepl_macro 16, 16
nmsub_macro D7, 2, 6, 3, 7
nmsub_macro D4, 2, 4, 3, 5
GMUL xvf, d, U2, D2, U2, U3, D2, U3
nmsub_macro 0, 1, 6, D6
nmsub_macro 0, 1, 4, D3
nmsub_macro 0, 1, 2, D1
nmsub_macro D6, 0, 6, 1, 7
nmsub_macro D3, 0, 4, 1, 5
nmsub_macro D1, 0, 2, 1, 3
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 7, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20, \
@ -306,22 +304,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 17, 18
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
.endm
@ -335,22 +333,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 17, 18
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 2
A_st_macro 2, 0, 0, 1, 2, 3
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
.endm
@ -364,22 +362,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 17, 18
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 1
A_st_macro 1, 0, 0, 1, 2, 3
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
.endm
@ -399,10 +397,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_16x1:
PTR_ADDI A0, T1, -16 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
.endm
@ -420,10 +418,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_8x1:
PTR_ADDI A0, T1, -8 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 1, 0, 4
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20
.endm
@ -440,10 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_4x1:
PTR_ADDI A0, T1, -4 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 4
A_st_macro 4, 0, 0
// Store C
GST xv, , U0, C0, 0x00
.endm
@ -460,10 +458,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_2x1:
PTR_ADDI A0, T1, -2 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 2
A_st_macro 2, 0, 0
// Store C
GST v, , $vr0, C0, 0x00
.endm
@ -480,10 +478,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_1x1:
PTR_ADDI A0, T1, -1 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 1
A_st_macro 1, 0, 0
// Store C
GST f, d, $f0, C0, 0x00
.endm
@ -697,7 +695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
PTR_SUB KK, N, OFFSET
PTR_MUL T0, N, LDC
@ -948,6 +946,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI KK, KK, -4
bnez J, .L_J1
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -90,57 +90,175 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PTR_FST fst.d
#endif
// The max registers available to the user which
// do not need to be preserved across calls.
// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
#define MAX_INT_CALLER_SAVED 17
#define MAX_FP_CALLER_SAVED 24
.altmacro // Enable alternate macro mode
/*
* Pushing and popping static registers into/from the stack.
* regs : number of static general-purpose registers, greater than or equal to 0, less than or equal to 9
* fregs: number of static floating-point registers, greater than or equal to 0, less than or equal to 8
*/
.macro push_if_used regs, fregs
.if \regs > MAX_INT_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
.if \regs > 0
PTR_ADDI $sp, $sp, -(\regs << REG_LOG)
push_regs 0, \regs - 1
.endif
.if \fregs > MAX_FP_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
.if \fregs > 0
PTR_ADDI $sp, $sp, -(\fregs << FREG_LOG)
push_fregs 0, \fregs - 1
.endif
.endm // End push_if_used
.macro pop_if_used regs, fregs
.if \fregs > MAX_FP_CALLER_SAVED
pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
.if \fregs > 0
pop_fregs 0, \fregs - 1
PTR_ADDI $sp, $sp, \fregs << FREG_LOG
.endif
.if \regs > MAX_INT_CALLER_SAVED
pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
.if \regs > 0
pop_regs 0, \regs - 1
PTR_ADDI $sp, $sp, \regs << REG_LOG
.endif
.endm // End pop_if_used
.macro push_regs from, to
PTR_ST $s\()\from, $sp, \from << REG_LOG
#ifdef __clang__
.if \to >= 0
PTR_ST $s0, $sp, 0 << REG_LOG
.endif
.if \to >= 1
PTR_ST $s1, $sp, 1 << REG_LOG
.endif
.if \to >= 2
PTR_ST $s2, $sp, 2 << REG_LOG
.endif
.if \to >= 3
PTR_ST $s3, $sp, 3 << REG_LOG
.endif
.if \to >= 4
PTR_ST $s4, $sp, 4 << REG_LOG
.endif
.if \to >= 5
PTR_ST $s5, $sp, 5 << REG_LOG
.endif
.if \to >= 6
PTR_ST $s6, $sp, 6 << REG_LOG
.endif
.if \to >= 7
PTR_ST $s7, $sp, 7 << REG_LOG
.endif
.if \to >= 8
PTR_ST $s8, $sp, 8 << REG_LOG
.endif
#else
PTR_ST $s\()\from, $sp, \from << REG_LOG
.if \to - \from
push_regs %from + 1, \to
.endif
#endif
.endm // End push_regs
.macro pop_regs from, to
#ifdef __clang__
.if \to >= 0
PTR_LD $s0, $sp, 0 << REG_LOG
.endif
.if \to >= 1
PTR_LD $s1, $sp, 1 << REG_LOG
.endif
.if \to >= 2
PTR_LD $s2, $sp, 2 << REG_LOG
.endif
.if \to >= 3
PTR_LD $s3, $sp, 3 << REG_LOG
.endif
.if \to >= 4
PTR_LD $s4, $sp, 4 << REG_LOG
.endif
.if \to >= 5
PTR_LD $s5, $sp, 5 << REG_LOG
.endif
.if \to >= 6
PTR_LD $s6, $sp, 6 << REG_LOG
.endif
.if \to >= 7
PTR_LD $s7, $sp, 7 << REG_LOG
.endif
.if \to >= 8
PTR_LD $s8, $sp, 8 << REG_LOG
.endif
#else
PTR_LD $s\()\from, $sp, \from << REG_LOG
.if \to - \from
pop_regs %from + 1, \to
.endif
#endif
.endm // End pop_regs
.macro push_fregs from, to
#ifdef __clang__
.if \to >= 0
PTR_FST $fs0, $sp, 0 << FREG_LOG
.endif
.if \to >= 1
PTR_FST $fs1, $sp, 1 << FREG_LOG
.endif
.if \to >= 2
PTR_FST $fs2, $sp, 2 << FREG_LOG
.endif
.if \to >= 3
PTR_FST $fs3, $sp, 3 << FREG_LOG
.endif
.if \to >= 4
PTR_FST $fs4, $sp, 4 << FREG_LOG
.endif
.if \to >= 5
PTR_FST $fs5, $sp, 5 << FREG_LOG
.endif
.if \to >= 6
PTR_FST $fs6, $sp, 6 << FREG_LOG
.endif
.if \to >= 7
PTR_FST $fs7, $sp, 7 << FREG_LOG
.endif
#else
PTR_FST $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
push_fregs %from + 1, \to
.endif
#endif
.endm // End push_fregs
.macro pop_fregs from, to
#ifdef __clang__
.if \to >= 0
PTR_FLD $fs0, $sp, 0 << FREG_LOG
.endif
.if \to >= 1
PTR_FLD $fs1, $sp, 1 << FREG_LOG
.endif
.if \to >= 2
PTR_FLD $fs2, $sp, 2 << FREG_LOG
.endif
.if \to >= 3
PTR_FLD $fs3, $sp, 3 << FREG_LOG
.endif
.if \to >= 4
PTR_FLD $fs4, $sp, 4 << FREG_LOG
.endif
.if \to >= 5
PTR_FLD $fs5, $sp, 5 << FREG_LOG
.endif
.if \to >= 6
PTR_FLD $fs6, $sp, 6 << FREG_LOG
.endif
.if \to >= 7
PTR_FLD $fs7, $sp, 7 << FREG_LOG
.endif
#else
PTR_FLD $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
pop_fregs %from + 1, \to
.endif
#endif
.endm // End pop_fregs
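With this rewrite, push_if_used/pop_if_used take the number of callee-saved ("static") registers to spill directly, instead of the old totals from which MAX_INT_CALLER_SAVED/MAX_FP_CALLER_SAVED were subtracted; that is why every call site in this commit shrinks from forms like push_if_used 17 + 7, 31 to push_if_used 7, 7. A rough expansion sketch, assuming the 64-bit definitions (REG_LOG == 3, FREG_LOG == 3, PTR_ADDI == addi.d, PTR_ST == st.d, PTR_FST == fst.d):

/* hypothetical call site: a kernel that clobbers $s0, $s1 and $fs0 */
push_if_used 2, 1

/* expands to roughly: */
addi.d  $sp, $sp, -16    /* 2 << REG_LOG bytes for $s0, $s1 */
st.d    $s0, $sp, 0
st.d    $s1, $sp, 8
addi.d  $sp, $sp, -8     /* 1 << FREG_LOG bytes for $fs0 */
fst.d   $fs0, $sp, 0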
//
@ -275,7 +393,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// GXOR
//
.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()xor.\suf_op \out, \in0, \in1
.ifnb \pre_op
\pre_op\()xor.v \out, \in0, \in1
.else
xor.\suf_op \out, \in0, \in1
.endif
.ifnb \more
GXOR \pre_op, \suf_op, \more
.endif
@ -307,6 +429,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GPRELD \more
.endif
.endm
//
// GPACKEV
//
.macro GPACKEV pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()packev.\suf_op \out, \in0, \in1
.ifnb \more
GPACKEV \pre_op, \suf_op, \more
.endif
.endm
//
// GPACKOD
//
.macro GPACKOD pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()packod.\suf_op \out, \in0, \in1
.ifnb \more
GPACKOD \pre_op, \suf_op, \more
.endif
.endm
//
// GSHUF4I
//
.macro GSHUF4I pre_op:req, suf_op:req, out:req, in0:req, in1:req /* imm */, more:vararg
\pre_op\()shuf4i.\suf_op \out, \in0, \in1
.ifnb \more
GSHUF4I \pre_op, \suf_op, \more
.endif
.endm
.macro TRANSF2G name, pre_op:req, suf_op:req, more:vararg
.ifeqs "\pre_op\()\suf_op", "vfs"
\name v, w, \more
.endif
.ifeqs "\pre_op\()\suf_op", "vfd"
\name v, d, \more
.endif
.ifeqs "\pre_op\()\suf_op", "xvfs"
\name xv, w, \more
.endif
.ifeqs "\pre_op\()\suf_op", "xvfd"
\name xv, d, \more
.endif
.endm
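TRANSF2G bridges two naming schemes: the floating-point callers pass (vf/xvf, s/d), while the generic SIMD macros above expect (v/xv, w/d). Routing through this one dispatcher is what lets GCOMPLEXMUL/GCOMPLEXMADD below collapse their old open-coded vf-versus-xvf and s-versus-d branches into single GXOR/GPACKEV/GPACKOD/GSHUF4I calls. A worked expansion (illustrative registers only):

/* TRANSF2G GPACKEV, xvf, s, $xr2, $xr0, $xr1
 * matches the "xvfs" case and becomes
 *     GPACKEV xv, w, $xr2, $xr0, $xr1
 * which emits the single instruction:
 */
xvpackev.w $xr2, $xr0, $xr1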
//
// Compound instructions
@ -314,61 +478,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// GACC: Accumulate the values of vector registers
//
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
.ifeqs "\pre_op\()\suf_op", "xvfd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvfs"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "vf"
.ifeqs "\pre_op\()\suf_op", "vfd"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vfs"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "xv"
.ifeqs "\pre_op\()\suf_op", "xvd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvw"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvh"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvb"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
xvpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif
.ifeqs "\pre_op", "v"
.ifeqs "\pre_op\()\suf_op", "vd"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vw"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vh"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vb"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
vpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif
.ifnb \more
GACC \pre_op, \suf_op, \more
@ -391,26 +590,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Note: When "pre_op = xvf && suf_op = s", in will be modified.
//
.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
.ifeqs "\pre_op\()\suf_op", "xvfd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvfs"
xvpermi.q \out, \in, 0x01
.ifeqs "\suf_op", "s"
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.else
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "vf"
.ifeqs "\suf_op", "s"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.else
.ifeqs "\pre_op\()\suf_op", "vfd"
vor.v \out, \in, \in
.endif
.endif
.ifeqs "\pre_op\()\suf_op", "vfs"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifnb \more
GCOMPLEXACC \pre_op, \suf_op, \more
@ -430,56 +629,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// suf_op: s or d, differentiate between single precision or double precision complex numbers
//
.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
xvpackev.w \tmp0, \in0, \in0
.else
xvpackev.d \tmp0, \in0, \in0
.endif
.else
vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
vpackev.w \tmp0, \in0, \in0
.else
vpackev.d \tmp0, \in0, \in0
.endif
.endif
TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1
TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
.ifeqs "\pre_op", "xvf"
.ifeqs "\xconj", "0"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
.else
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
.endif
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.else
xvpackod.w \tmp1, \tmp1, \in0
.endif
xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \tmp1
.else
xvpackod.d \tmp1, \tmp1, \in0
.endif
xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \tmp1
.else
vpackod.w \tmp1, \tmp1, \in0
.endif
vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \tmp1
.else
vpackod.d \tmp1, \tmp1, \in0
.endif
vshuf4i.d \tmp2, \in1, 0x0b
.endif
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
\pre_op\()mul.\suf_op \out, \tmp0, \in1
@ -512,112 +676,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// suf_op: s or d, differentiate between single precision or double precision complex numbers
//
.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
xvpackev.w \tmp0, \in0, \in0
.else
xvpackev.d \tmp0, \in0, \in0
.endif
.else
vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
vpackev.w \tmp0, \in0, \in0
.else
vpackev.d \tmp0, \in0, \in0
.endif
.endif
TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1
TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0
\pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2
.ifeqs "\conj", "1"
.ifeqs "\conj\()\suf_op", "1s"
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
xvshuf4i.w \tmp0, \tmp0, 0xb1
xvpackev.w \out, \tmp0, \tmp2
.else
xvshuf4i.d \tmp0, \tmp0, 0x0b
xvpackev.d \out, \tmp0, \tmp2
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0xb1
TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2
.endif
.else
.ifeqs "\suf_op", "s"
vshuf4i.w \tmp0, \tmp0, 0xb1
vpackev.w \out, \tmp0, \tmp2
.else
vshuf4i.d \tmp0, \tmp0, 0x0b
vpackev.d \out, \tmp0, \tmp2
.ifeqs "\conj\()\suf_op", "1d"
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0x0b
TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2
.endif
.endif /* pre_op = xvf */
.else
.ifeqs "\conj", "0"
\pre_op\()add.\suf_op \out, \tmp2, \tmp1
.endif /* conj = 1 */
.endif
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \tmp1
.else
xvpackod.w \tmp1, \tmp1, \in0
.ifeqs "\xconj\()\conj\()\suf_op", "00s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.else
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \in0
.else
xvpackod.w \tmp1, \tmp1, \tmp1
.ifeqs "\xconj\()\conj\()\suf_op", "10s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "01s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \tmp1
.else
xvpackod.d \tmp1, \tmp1, \in0
.ifeqs "\xconj\()\conj\()\suf_op", "11s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.else
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \in0
.else
xvpackod.d \tmp1, \tmp1, \tmp1
.ifeqs "\xconj\()\conj\()\suf_op", "00d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "10d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \tmp1
.else
vpackod.w \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \in0
.else
vpackod.w \tmp1, \tmp1, \tmp1
.endif
.endif
vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \tmp1
.else
vpackod.d \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \in0
.else
vpackod.d \tmp1, \tmp1, \tmp1
.endif
.endif
vshuf4i.d \tmp2, \in1, 0x0b
.ifeqs "\xconj\()\conj\()\suf_op", "01d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "11d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out

View File

@ -837,7 +837,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
xvreplve0.w VALPHA, $xr0
#if defined (TRMMKERNEL) && !defined(LEFT)
PTR_SUB OFF, ZERO, OFFSET
@ -2343,6 +2343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif // #if defined(TRMMKERNEL)
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_N0
PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
move TD, DST
move TS, SRC
@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI M, M, -1
blt ZERO, M, .L_N1_M1
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_N0
PROLOGUE
push_if_used 17, 20
push_if_used 0, 0
move TD, DST
move TS, SRC
@ -293,6 +293,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI M, M, -1
blt ZERO, M, .L_N1_M1
.L_N0:
pop_if_used 17, 20
pop_if_used 0, 0
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_M0
PROLOGUE
push_if_used 24, 8
push_if_used 7, 0
move S0, SRC
move P0, DST
@ -521,6 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI S1, S1, 0x04
PTR_ADDI P5, P5, 0x04
.L_M0:
pop_if_used 24, 8
pop_if_used 7, 0
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_M0
PROLOGUE
push_if_used 23, 8
push_if_used 6, 0
move S0, SRC
move P0, DST
@ -401,6 +401,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI S1, S1, 0x04
PTR_ADDI P4, P4, 0x04
.L_M0:
pop_if_used 23, 8
pop_if_used 6, 0
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -418,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 19
push_if_used 7, 0
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 19
pop_if_used 7, 0
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 18
push_if_used 8, 0
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -400,6 +400,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
.L_END:
pop_if_used 17 + 8, 18
pop_if_used 8, 0
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -253,7 +253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -291,6 +291,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
@ -337,7 +337,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -263,6 +263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
ZGEMV_T_LSX GAP_1, X2_GAP
.L_END:
pop_if_used 17 + 8, 30
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@ -294,6 +294,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
ZGEMV_T_LASX GAP_1, X4_GAP
.L_END:
pop_if_used 17 + 8, 30
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE