loongarch64: Fixed clang compilation issues
This commit is contained in:
parent 15b9fc3f78
commit 7cd438a5ac
@@ -955,12 +955,18 @@ endif

ifeq ($(ARCH), loongarch64)
LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
LA64_ARCH=$(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64)
ifneq ($(LA64_ABI), lp64d)
LA64_ABI=lp64
endif
ifneq ($(LA64_ARCH), loongarch64)
CCOMMON_OPT += -mabi=$(LA64_ABI)
FCOMMON_OPT += -mabi=$(LA64_ABI)
else
CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
endif
endif

endif
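The two $(shell ...) probes above feed the conditionals that follow: clang only understands the lp64d ABI spelling, while older gcc wants plain lp64, and -march=loongarch64 is passed through only when the compiler accepts it. A minimal standalone sketch of the same probing logic (hypothetical script; assumes cpuid_loongarch64.c from the source tree, with cc standing in for $(CC)):

    CC=${CC:-cc}
    # ABI spelling: lp64d where accepted (clang, newer gcc), else lp64.
    abi=lp64
    $CC -mabi=lp64d -c cpuid_loongarch64.c -o /dev/null 2>/dev/null && abi=lp64d
    # Only emit -march=loongarch64 when the compiler accepts it.
    march=
    $CC -march=loongarch64 -c cpuid_loongarch64.c -o /dev/null 2>/dev/null && march='-march=loongarch64'
    echo "CCOMMON_OPT += $march -mabi=$abi"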
c_check
@@ -197,10 +197,22 @@ fi
no_lsx=0
no_lasx=0
if [ "$architecture" = "loongarch64" ]; then
lasx_flags='-march=loongarch64'
lsx_flags='-march=loongarch64'

tmpd="$(mktemp -d)"
tmparch="$tmpd/arch.c"
printf "void main(void){ }\n" >> "$tmparch"
args="-march=loongarch64 -o $tmparch.o $tmparch"
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
lasx_flags=''
lsx_flags=''
}

tmplsx="$tmpd/lsx.c"
codelsx='"vadd.b $vr0, $vr0, $vr0"'
lsx_flags='-march=loongarch64'
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
args="$lsx_flags -o $tmplsx.o $tmplsx"
{
@@ -211,7 +223,6 @@ if [ "$architecture" = "loongarch64" ]; then

tmplasx="$tmpd/lasx.c"
codelasx='"xvadd.b $xr0, $xr0, $xr0"'
lasx_flags='-march=loongarch64'
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
args="$lasx_flags -o $tmplasx.o $tmplasx"
{
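The c_check changes above decide LSX/LASX support by test-compiling tiny generated programs rather than assuming it from the architecture name. The same probes can be run by hand; a sketch (cc stands in for $compiler_name $flags; the probe bodies mirror the generated lsx.c/lasx.c):

    tmpd="$(mktemp -d)"
    printf 'void main(void){ __asm__ volatile("vadd.b $vr0, $vr0, $vr0");}\n' > "$tmpd/lsx.c"
    printf 'void main(void){ __asm__ volatile("xvadd.b $xr0, $xr0, $xr0");}\n' > "$tmpd/lasx.c"
    cc -march=loongarch64 -o "$tmpd/lsx.o" "$tmpd/lsx.c" 2>/dev/null && echo "LSX ok"
    cc -march=loongarch64 -o "$tmpd/lasx.o" "$tmpd/lasx.c" 2>/dev/null && echo "LASX ok"
    rm -rf "$tmpd"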
@@ -279,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K

@@ -318,6 +318,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

@@ -336,7 +336,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K

@@ -378,6 +378,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

@@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */

@@ -285,6 +285,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
CGEMV_T_LSX GAP_1, X4_GAP
.L_END:
pop_if_used 17 + 8, 30
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE

@@ -304,7 +304,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */

@@ -337,6 +337,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
CGEMV_T_LASX GAP_1, X8_GAP
.L_END:
pop_if_used 17 + 8, 30
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE

@@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define D7 $vr15

PROLOGUE
push_if_used 26, 32
push_if_used 0, 0
move TD, DST
move TS, SRC
slli.d TL, LDA, 0x03

@@ -278,6 +278,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d M, M, -1
blt ZERO, M, .L_M1
.L_N0:
pop_if_used 26, 32
pop_if_used 0, 0
jirl $r0, $r1, 0x00
EPILOGUE

@@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define U7 $vr7

PROLOGUE
push_if_used 18, 8
push_if_used 1, 0

move S0, SRC
move P0, DST

@@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fst.d F0, P3, 0x00

.L_M0:
pop_if_used 18, 8
pop_if_used 1, 0
jirl $r0, $r1, 0x00

EPILOGUE

@@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define U7 $vr7

PROLOGUE
push_if_used 24, 8
push_if_used 7, 0

move S0, SRC
move P0, DST

@@ -592,6 +592,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d S1, S1, 0x08
addi.d P4, P4, 0x08
.L_M0:
pop_if_used 24, 8
pop_if_used 7, 0
jirl $r0, $r1, 0x00
EPILOGUE

@@ -509,7 +509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 24 + 4
push_if_used 7, 4
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K

@@ -549,6 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 24 + 4
pop_if_used 7, 4
jirl $r0, $r1, 0x0
EPILOGUE

@@ -445,7 +445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 24 + 3
push_if_used 8, 3
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */

@@ -476,6 +476,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
.L_END:
pop_if_used 17 + 8, 24 + 3
pop_if_used 8, 3
jirl $r0, $r1, 0x0
EPILOGUE

@@ -1029,7 +1029,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
/* if (!(N >> 2)) goto L_N3 */
PTR_SRAI J, N, 2 /* J = bn >> 2 */

@@ -1361,6 +1361,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
blt ZERO, I, .L_N1_I1
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE
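Throughout the kernel prologues above, the push_if_used/pop_if_used arguments shrink (e.g. from 17 + 7, 31 to 7, 7). Judging from the macro comment later in this commit, the arguments now count only the static (callee-saved) registers to spill, instead of caller-saved plus callee-saved totals.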
@@ -128,31 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "dtrsm_kernel_macro.S"

.macro ldrepl_macro start, end, stride
.macro ldrepl_macro stride:req, index:req, more:vararg
// Load Ux (x = 0...15)
.if \start <= \end
GLDREPL xv, d, $xr\start, A0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
GLDREPL xv, d, $xr\index, A0, \index * 8 - \stride * 8
.ifnb \more
ldrepl_macro \stride, \more
.endif
.endm
.macro nmsub_macro start0, end0, start1, reg
.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
// Gx -= reg * Ux
.if \start0 <= \end0
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.ifnb \more
nmsub_macro \reg, \more
.endif
.endm
.macro B_st_macro start, end, stride, N
.macro B_st_macro N:req, stride:req, start:req, more:vararg
// Store Gx(x = 16...31)
.if \start <= \end
.if \N == 4
xvst $xr\start, B0, \stride * 0x20
xvst $xr\start, B0, \start * 0x20 - \stride * 0x20
.elseif \N == 2
vst $vr\start, B0, \stride * 0x10
vst $vr\start, B0, \start * 0x10 - \stride * 0x10
.elseif \N == 1
fst.d $f\start, B0, \stride * 0x08
fst.d $f\start, B0, \start * 0x08 - \stride * 0x08
.endif
B_st_macro %start + 1, \end, %stride + 1, \N
.ifnb \more
B_st_macro \N, \stride, \more
.endif
.endm
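The macro rewrite above is the core of the clang fix: the old form counted from \start to \end with .altmacro-style %start + 1 arithmetic, which clang's integrated assembler does not evaluate, while the new form recurses over an explicit vararg list of register indices and derives each offset as \index * 8 - \stride * 8 using ordinary expression arithmetic. A self-contained probe in the spirit of c_check (ldrepl_demo is a hypothetical name; cc stands in for the configured compiler):

    tmps="$(mktemp -d)/vararg.S"
    cat > "$tmps" <<'EOF'
    .macro ldrepl_demo stride:req, index:req, more:vararg
        xvldrepl.d $xr\index, $a0, \index * 8 - \stride * 8
        .ifnb \more
            ldrepl_demo \stride, \more    // recurse on the remaining indices
        .endif
    .endm
    ldrepl_demo 0, 0, 1, 2, 3             // expands to four xvldrepl.d loads
    EOF
    cc -march=loongarch64 -c "$tmps" -o /dev/null 2>/dev/null && echo "vararg macro pattern OK"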
|
@ -194,86 +194,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 255
|
||||
// Sequentially extract data from A in row order
|
||||
// Load 0
|
||||
ldrepl_macro 0, 15, 0
|
||||
ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G0, G0, U0
|
||||
nmsub_macro 17, 31, 1, G0
|
||||
nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \
|
||||
25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 1
|
||||
ldrepl_macro 1, 15, 0
|
||||
ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G1, G1, U1
|
||||
nmsub_macro 18, 31, 2, G1
|
||||
nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \
|
||||
25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 2
|
||||
ldrepl_macro 2, 15, 0
|
||||
ldrepl_macro 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G2, G2, U2
|
||||
nmsub_macro 19, 31, 3, G2
|
||||
nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, \
|
||||
10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 3
|
||||
ldrepl_macro 3, 15, 0
|
||||
ldrepl_macro 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G3, G3, U3
|
||||
nmsub_macro 20, 31, 4, G3
|
||||
nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, \
|
||||
27, 11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 4
|
||||
ldrepl_macro 4, 15, 0
|
||||
ldrepl_macro 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G4, G4, U4
|
||||
nmsub_macro 21, 31, 5, G4
|
||||
nmsub_macro G4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, \
|
||||
28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 5
|
||||
ldrepl_macro 5, 15, 0
|
||||
ldrepl_macro 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G5, G5, U5
|
||||
nmsub_macro 22, 31, 6, G5
|
||||
nmsub_macro G5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, \
|
||||
29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 6
|
||||
ldrepl_macro 6, 15, 0
|
||||
ldrepl_macro 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G6, G6, U6
|
||||
nmsub_macro 23, 31, 7, G6
|
||||
nmsub_macro G6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, \
|
||||
30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 7
|
||||
ldrepl_macro 7, 15, 0
|
||||
ldrepl_macro 7, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G7, G7, U7
|
||||
nmsub_macro 24, 31, 8, G7
|
||||
nmsub_macro G7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 8
|
||||
ldrepl_macro 8, 15, 0
|
||||
ldrepl_macro 8, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G8, G8, U8
|
||||
nmsub_macro 25, 31, 9, G8
|
||||
nmsub_macro G8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 9
|
||||
ldrepl_macro 9, 15, 0
|
||||
ldrepl_macro 9, 9, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G9, G9, U9
|
||||
nmsub_macro 26, 31, 10, G9
|
||||
nmsub_macro G9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 10
|
||||
ldrepl_macro 10, 15, 0
|
||||
ldrepl_macro 10, 10, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G10, G10, U10
|
||||
nmsub_macro 27, 31, 11, G10
|
||||
nmsub_macro G10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 11
|
||||
ldrepl_macro 11, 15, 0
|
||||
ldrepl_macro 11, 11, 12, 13, 14, 15
|
||||
GMUL xvf, d, G11, G11, U11
|
||||
nmsub_macro 28, 31, 12, G11
|
||||
nmsub_macro G11, 28, 12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 12
|
||||
ldrepl_macro 12, 15, 0
|
||||
ldrepl_macro 12, 12, 13, 14, 15
|
||||
GMUL xvf, d, G12, G12, U12
|
||||
nmsub_macro 29, 31, 13, G12
|
||||
nmsub_macro G12, 29, 13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 13
|
||||
ldrepl_macro 13, 15, 0
|
||||
ldrepl_macro 13, 13, 14, 15
|
||||
GMUL xvf, d, G13, G13, U13
|
||||
nmsub_macro 30, 31, 14, G13
|
||||
nmsub_macro G13, 30, 14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 14
|
||||
ldrepl_macro 14, 15, 0
|
||||
ldrepl_macro 14, 14, 15
|
||||
GMUL xvf, d, G14, G14, U14
|
||||
nmsub_macro 31, 31, 15, G14
|
||||
nmsub_macro G14, 31, 15
|
||||
PTR_ADDI A0, A0, 17 * 8
|
||||
// Load 15
|
||||
ldrepl_macro 15, 15, 0
|
||||
ldrepl_macro 15, 15
|
||||
GMUL xvf, d, G15, G15, U15
|
||||
// Finally, We can store the result.
|
||||
// For B, stored sequentially, and C, first transpose and then store
|
||||
B_st_macro 16, 31, 0, \N
|
||||
B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
|
||||
GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
|
||||
GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1
|
||||
|
@ -334,46 +341,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 63
|
||||
// Sequentially extract data from A in row order
|
||||
// Load 0
|
||||
ldrepl_macro 0, 7, 0
|
||||
ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
GMUL xvf, d, G0, G0, U0
|
||||
nmsub_macro 17, 23, 1, G0
|
||||
nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
|
||||
PTR_ADDI A0, A0, 9 * 8
|
||||
// Load 1
|
||||
ldrepl_macro 1, 7, 0
|
||||
ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7
|
||||
GMUL xvf, d, G1, G1, U1
|
||||
nmsub_macro 18, 23, 2, G1
|
||||
nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
|
||||
PTR_ADDI A0, A0, 9 * 8
|
||||
// Load 2
|
||||
ldrepl_macro 2, 7, 0
|
||||
ldrepl_macro 2, 2, 3, 4, 5, 6, 7
|
||||
GMUL xvf, d, G2, G2, U2
|
||||
nmsub_macro 19, 23, 3, G2
|
||||
nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
|
||||
PTR_ADDI A0, A0, 9 * 8
|
||||
// Load 3
|
||||
ldrepl_macro 3, 7, 0
|
||||
ldrepl_macro 3, 3, 4, 5, 6, 7
|
||||
GMUL xvf, d, G3, G3, U3
|
||||
nmsub_macro 20, 23, 4, G3
|
||||
nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7
|
||||
PTR_ADDI A0, A0, 9 * 8
|
||||
// Load 4
|
||||
ldrepl_macro 4, 7, 0
|
||||
ldrepl_macro 4, 4, 5, 6, 7
|
||||
GMUL xvf, d, G4, G4, U4
|
||||
nmsub_macro 21, 23, 5, G4
|
||||
nmsub_macro G4, 21, 5, 22, 6, 23, 7
|
||||
PTR_ADDI A0, A0, 9 * 8
|
||||
// Load 5
|
||||
ldrepl_macro 5, 7, 0
|
||||
ldrepl_macro 5, 5, 6, 7
|
||||
GMUL xvf, d, G5, G5, U5
|
||||
nmsub_macro 22, 23, 6, G5
|
||||
nmsub_macro G5, 22, 6, 23, 7
|
||||
PTR_ADDI A0, A0, 9 * 8
|
||||
// Load 6
|
||||
ldrepl_macro 6, 7, 0
|
||||
ldrepl_macro 6, 6, 7
|
||||
GMUL xvf, d, G6, G6, U6
|
||||
nmsub_macro 23, 23, 7, G6
|
||||
nmsub_macro G6, 23, 7
|
||||
PTR_ADDI A0, A0, 9 * 8
|
||||
// Load 7
|
||||
ldrepl_macro 7, 7, 0
|
||||
ldrepl_macro 7, 7
|
||||
GMUL xvf, d, G7, G7, U7
|
||||
// Finally, We can store the result.
|
||||
// For B, stored sequentially, and C, first transpose and then store
|
||||
B_st_macro 16, 23, 0, \N
|
||||
B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
|
||||
GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
|
||||
.if \N == 4
|
||||
|
@ -437,26 +444,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 15
|
||||
// Sequentially extract data from A in row order
|
||||
// Load 0
|
||||
ldrepl_macro 0, 3, 0
|
||||
ldrepl_macro 0, 0, 1, 2, 3
|
||||
GMUL xvf, d, G0, G0, U0
|
||||
nmsub_macro 17, 19, 1, G0
|
||||
nmsub_macro G0, 17, 1, 18, 2, 19, 3
|
||||
PTR_ADDI A0, A0, 5 * 8
|
||||
// Load 1
|
||||
ldrepl_macro 1, 3, 0
|
||||
ldrepl_macro 1, 1, 2, 3
|
||||
GMUL xvf, d, G1, G1, U1
|
||||
nmsub_macro 18, 19, 2, G1
|
||||
nmsub_macro G1, 18, 2, 19, 3
|
||||
PTR_ADDI A0, A0, 5 * 8
|
||||
// Load 2
|
||||
ldrepl_macro 2, 3, 0
|
||||
ldrepl_macro 2, 2, 3
|
||||
GMUL xvf, d, G2, G2, U2
|
||||
nmsub_macro 19, 19, 3, G2
|
||||
nmsub_macro G2, 19, 3
|
||||
PTR_ADDI A0, A0, 5 * 8
|
||||
// Load 3
|
||||
ldrepl_macro 3, 3, 0
|
||||
ldrepl_macro 3, 3
|
||||
GMUL xvf, d, G3, G3, U3
|
||||
// Finally, We can store the result.
|
||||
// For B, stored sequentially, and C, first transpose and then store
|
||||
B_st_macro 16, 19, 0, \N
|
||||
B_st_macro \N, 16, 16, 17, 18, 19
|
||||
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
|
||||
.if \N == 4
|
||||
GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00
|
||||
|
@ -501,16 +508,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 3
|
||||
// Sequentially extract data from A in row order
|
||||
// Load 0
|
||||
ldrepl_macro 0, 1, 0
|
||||
ldrepl_macro 0, 0, 1
|
||||
GMUL xvf, d, G0, G0, U0
|
||||
nmsub_macro 17, 17, 1, G0
|
||||
nmsub_macro G0, 17, 1
|
||||
PTR_ADDI A0, A0, 3 * 8
|
||||
// Load 1
|
||||
ldrepl_macro 1, 1, 0
|
||||
ldrepl_macro 1, 1
|
||||
GMUL xvf, d, G1, G1, U1
|
||||
// Finally, We can store the result.
|
||||
// For B, stored sequentially, and C, first transpose and then store
|
||||
B_st_macro 16, 17, 0, \N
|
||||
B_st_macro \N, 16, 16, 17
|
||||
GSBUTTERFLY xv, d, U0, U1, G1, G0
|
||||
.if \N == 4
|
||||
vst $vr0, C0, 0x00
|
||||
|
@@ -717,7 +724,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
/* if (!(N >> 2)) goto L_N3 */
PTR_SRAI J, N, 2 /* J = bn >> 2 */

@@ -954,6 +961,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE
@@ -128,33 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "dtrsm_kernel_macro.S"

.macro ldrepl_macro start, end, stride
.macro ldrepl_macro stride:req, index:req, more:vararg
// Load Ux (x = 0...15)
.if \start <= \end
GLDREPL xv, d, $xr\start, B0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
.ifnb \more
ldrepl_macro \stride, \more
.endif
.endm

.macro nmsub_macro start0, end0, start1, reg
// Ux -= reg * Dx
.if \start0 <= \end0
.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
// Gx -= reg * Ux
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.ifnb \more
nmsub_macro \reg, \more
.endif
.endm

.macro A_st_macro start, end, stride, N
// Store Ux(x = 0...15)
.if \start <= \end
.macro A_st_macro N:req, stride:req, start:req, more:vararg
// Store Gx(x = 16...31)
.if \N == 4
xvst $xr\start, A0, \stride * 0x20
xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
.elseif \N == 2
vst $vr\start, A0, \stride * 0x10
vst $vr\start, A0, \start * 0x10 - \stride * 0x10
.elseif \N == 1
fst.d $f\start, A0, \stride * 0x08
fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
.endif
A_st_macro %start + 1, \end, %stride + 1, \N
.ifnb \more
A_st_macro \N, \stride, \more
.endif
.endm

@@ -167,22 +165,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
ldrepl_macro 20, 22, 5
nmsub_macro 4, 7, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22

nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
ldrepl_macro 13, 23, 24
GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7
ldrepl_macro 25, 25, 15
nmsub_macro 8, 11, 0, D2
nmsub_macro 8, 11, 4, D5
ldrepl_macro 10, 25
nmsub_macro D2, 8, 0, 9, 1, 10, 2, 11, 3
nmsub_macro D5, 8, 4, 9, 5, 10, 6, 11, 7
GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11
nmsub_macro 12, 15, 0, D3
nmsub_macro 12, 15, 4, D6
nmsub_macro 12, 15, 8, D8
nmsub_macro D3, 12, 0, 13, 1, 14, 2, 15, 3
nmsub_macro D6, 12, 4, 13, 5, 14, 6, 15, 7
nmsub_macro D8, 12, 8, 13, 9, 14, 10, 15, 11
GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
// Store A
A_st_macro 0, 15, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \

@@ -197,13 +196,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
ldrepl_macro 18, 18, 3
nmsub_macro 4, 7, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
// Store A
A_st_macro 0, 7, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60

@@ -218,22 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0, U1, D0, U1
ldrepl_macro 20, 22, 5
nmsub_macro 2, 3, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 2, 0, 3, 1
ldrepl_macro 13, 23, 24
GMUL xvf, d, U2, D4, U2, U3, D4, U3
ldrepl_macro 25, 25, 15
nmsub_macro 4, 5, 0, D2
nmsub_macro 4, 5, 2, D5
ldrepl_macro 10, 25
nmsub_macro D2, 4, 0, 5, 1
nmsub_macro D5, 4, 2, 5, 3
GMUL xvf, d, U4, D7, U4, U5, D7, U5
nmsub_macro 6, 7, 0, D3
nmsub_macro 6, 7, 2, D6
nmsub_macro 6, 7, 4, D8
nmsub_macro D3, 6, 0, 7, 1
nmsub_macro D6, 6, 2, 7, 3
nmsub_macro D8, 6, 4, 7, 5
GMUL xvf, d, U6, D9, U6, U7, D9, U7
// Store A
A_st_macro 0, 7, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20, \

@@ -248,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0, U1, D0, U1
ldrepl_macro 18, 18, 3
nmsub_macro 2, 3, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 2, 0, 3, 1
GMUL xvf, d, U2, D2, U2, U3, D2, U3
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20

@@ -269,22 +268,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 1, 0
ldrepl_macro 13, 23, 24
GMUL xvf, d, U1, D4, U1
ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
ldrepl_macro 10, 25
nmsub_macro D2, 2, 0
nmsub_macro D5, 2, 1
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
nmsub_macro D3, 3, 0
nmsub_macro D6, 3, 1
nmsub_macro D8, 3, 2
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
.endm

@@ -296,13 +295,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 1, 0
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 4
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00
.endm

@@ -316,23 +315,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 1, 0
ldrepl_macro 13, 23, 24
GMUL xvf, d, U1, D4, U1

ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
ldrepl_macro 10, 25
nmsub_macro D2, 2, 0
nmsub_macro D5, 2, 1
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
nmsub_macro D3, 3, 0
nmsub_macro D6, 3, 1
nmsub_macro D8, 3, 2
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 2
A_st_macro 2, 0, 0, 1, 2, 3
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00,
.endm

@@ -344,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 1, 0
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 2
A_st_macro 2, 0, 0, 1
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
.endm

@@ -364,23 +363,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
ldrepl_macro 16, 16, 17, 18, 19
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
ldrepl_macro 15, 20, 21, 22
nmsub_macro D1, 1, 0
ldrepl_macro 13, 23, 24
GMUL xvf, d, U1, D4, U1

ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
ldrepl_macro 10, 25
nmsub_macro D2, 2, 0
nmsub_macro D5, 2, 1
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
nmsub_macro D3, 3, 0
nmsub_macro D6, 3, 1
nmsub_macro D8, 3, 2
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 1
A_st_macro 1, 0, 0, 1, 2, 3
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
.endm

@@ -392,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 1, 0
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 1
A_st_macro 1, 0, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
.endm
@@ -582,10 +581,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvld U2, C0, 0x40
xvld U3, C0, 0x60
.L_dsolve_16x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
.endm

@@ -599,10 +598,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvld U0, C0, 0x00
xvld U1, C0, 0x20
.L_dsolve_8x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 1, 0, 4
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20
.endm

@@ -615,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_4x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 4
A_st_macro 4, 0, 0
// Store C
GST xv, , U0, C0, 0x00
.endm

@@ -631,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_2x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 2
A_st_macro 2, 0, 0
// Store C
GST v, , $vr0, C0, 0x00
.endm

@@ -647,16 +646,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Load C
fld.d $f0, C0, 0x00
.L_dsolve_1x1:
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 1
A_st_macro 1, 0, 0
// Store C
GST f, d, $f0, C0, 0x00
.endm

PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
PTR_SUB KK, ZERO, OFFSET
/* if (!(N >> 2)) goto L_N3 */
@@ -877,6 +876,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

@@ -111,33 +111,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "dtrsm_kernel_macro.S"

.macro ldrepl_macro start, end, stride
.macro ldrepl_macro stride:req, index:req, more:vararg
// Load Ux (x = 0...15)
.if \start <= \end
GLDREPL xv, d, $xr\start, B0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
.ifnb \more
ldrepl_macro \stride, \more
.endif
.endm

.macro nmsub_macro start0, end0, start1, reg
// Ux -= reg * Dx
.if \start0 <= \end0
.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
// Gx -= reg * Ux
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.ifnb \more
nmsub_macro \reg, \more
.endif
.endm

.macro A_st_macro start, end, stride, N
// Store Ux(x = 0...15)
.if \start <= \end
.macro A_st_macro N:req, stride:req, start:req, more:vararg
// Store Gx(x = 16...31)
.if \N == 4
xvst $xr\start, A0, \stride * 0x20
xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
.elseif \N == 2
vst $vr\start, A0, \stride * 0x10
vst $vr\start, A0, \start * 0x10 - \stride * 0x10
.elseif \N == 1
fst.d $f\start, A0, \stride * 0x08
fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
.endif
A_st_macro %start + 1, \end, %stride + 1, \N
.ifnb \more
A_st_macro \N, \stride, \more
.endif
.endm

@@ -148,13 +146,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
nmsub_macro 0, 3, 4, D1
nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 7, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60

@@ -167,13 +165,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U2, D2, U2, U3, D2, U3
nmsub_macro 0, 1, 2, D1
nmsub_macro D1, 0, 2, 1, 3
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20

@@ -186,13 +184,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 4
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00
.endm

@@ -204,13 +202,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 2
A_st_macro 2, 0, 0, 1
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
.endm

@@ -222,13 +220,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 1
A_st_macro 1, 0, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
.endm
@@ -242,22 +240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
ldrepl_macro 19, 21, 8
nmsub_macro 8, 11, 12, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 8, 12, 9, 13, 10, 14, 11, 15
ldrepl_macro 13, 17, 18
GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11
ldrepl_macro 16, 16, 0
nmsub_macro 4, 7, 12, D7
nmsub_macro 4, 7, 8, D4
ldrepl_macro 16, 16
nmsub_macro D7, 4, 12, 5, 13, 6, 14, 7, 15
nmsub_macro D4, 4, 8, 5, 9, 6, 10, 7, 11
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
nmsub_macro 0, 3, 12, D6
nmsub_macro 0, 3, 8, D3
nmsub_macro 0, 3, 4, D1
nmsub_macro D6, 0, 12, 1, 13, 2, 14, 3, 15
nmsub_macro D3, 0, 8, 1, 9, 2, 10, 3, 11
nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 15, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \

@@ -274,22 +272,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U6, D9, U6, U7, D9, U7
ldrepl_macro 19, 21, 8
nmsub_macro 4, 5, 6, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 4, 6, 5, 7
ldrepl_macro 13, 17, 18
GMUL xvf, d, U4, D5, U4, U5, D5, U5
ldrepl_macro 16, 16, 0
nmsub_macro 2, 3, 6, D7
nmsub_macro 2, 3, 4, D4
ldrepl_macro 16, 16
nmsub_macro D7, 2, 6, 3, 7
nmsub_macro D4, 2, 4, 3, 5
GMUL xvf, d, U2, D2, U2, U3, D2, U3
nmsub_macro 0, 1, 6, D6
nmsub_macro 0, 1, 4, D3
nmsub_macro 0, 1, 2, D1
nmsub_macro D6, 0, 6, 1, 7
nmsub_macro D3, 0, 4, 1, 5
nmsub_macro D1, 0, 2, 1, 3
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 7, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20, \

@@ -306,22 +304,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 17, 18
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
.endm

@@ -335,22 +333,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 17, 18
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 2
A_st_macro 2, 0, 0, 1, 2, 3
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
.endm

@@ -364,22 +362,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 23, 24
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 1
A_st_macro 1, 0, 0, 1, 2, 3
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
.endm

@@ -392,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
ldrepl_macro 16, 16, 17
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
ldrepl_macro 15, 18
nmsub_macro D1, 1, 0
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 1
A_st_macro 1, 0, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
.endm
@@ -399,10 +397,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_16x1:
PTR_ADDI A0, T1, -16 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 3, 0, 4
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
.endm

@@ -420,10 +418,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_8x1:
PTR_ADDI A0, T1, -8 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 1, 0, 4
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20
.endm

@@ -440,10 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_4x1:
PTR_ADDI A0, T1, -4 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 4
A_st_macro 4, 0, 0
// Store C
GST xv, , U0, C0, 0x00
.endm

@@ -460,10 +458,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_2x1:
PTR_ADDI A0, T1, -2 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 2
A_st_macro 2, 0, 0
// Store C
GST v, , $vr0, C0, 0x00
.endm

@@ -480,10 +478,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_dsolve_1x1:
PTR_ADDI A0, T1, -1 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 1
A_st_macro 1, 0, 0
// Store C
GST f, d, $f0, C0, 0x00
.endm
@@ -697,7 +695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
PTR_SUB KK, N, OFFSET
PTR_MUL T0, N, LDC

@@ -948,6 +946,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI KK, KK, -4
bnez J, .L_J1
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE
@@ -90,57 +90,175 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PTR_FST fst.d
#endif

// The max registers available to the user which
// do not need to be preserved across calls.
// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
#define MAX_INT_CALLER_SAVED 17
#define MAX_FP_CALLER_SAVED 24

.altmacro // Enable alternate macro mode

/*
* Pushing and popping static registers into/from the stack.
* regs : number of static general-purpose registers, greater than or equal to 0, less than or equal to 9
* fregs: number of static floating-point registers, greater than or equal to 0, less than or equal to 8
*/
.macro push_if_used regs, fregs
.if \regs > MAX_INT_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
.if \regs > 0
PTR_ADDI $sp, $sp, -(\regs << REG_LOG)
push_regs 0, \regs - 1
.endif
.if \fregs > MAX_FP_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
.if \fregs > 0
PTR_ADDI $sp, $sp, -(\fregs << FREG_LOG)
push_fregs 0, \fregs - 1
.endif
.endm // End push_if_used

.macro pop_if_used regs, fregs
.if \fregs > MAX_FP_CALLER_SAVED
pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
.if \fregs > 0
pop_fregs 0, \fregs - 1
PTR_ADDI $sp, $sp, \fregs << FREG_LOG
.endif
.if \regs > MAX_INT_CALLER_SAVED
pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
.if \regs > 0
pop_regs 0, \regs - 1
PTR_ADDI $sp, $sp, \regs << REG_LOG
.endif
.endm // End pop_if_used

.macro push_regs from, to
PTR_ST $s\()\from, $sp, \from << REG_LOG
#ifdef __clang__
.if \to >= 0
PTR_ST $s0, $sp, 0 << REG_LOG
.endif
.if \to >= 1
PTR_ST $s1, $sp, 1 << REG_LOG
.endif
.if \to >= 2
PTR_ST $s2, $sp, 2 << REG_LOG
.endif
.if \to >= 3
PTR_ST $s3, $sp, 3 << REG_LOG
.endif
.if \to >= 4
PTR_ST $s4, $sp, 4 << REG_LOG
.endif
.if \to >= 5
PTR_ST $s5, $sp, 5 << REG_LOG
.endif
.if \to >= 6
PTR_ST $s6, $sp, 6 << REG_LOG
.endif
.if \to >= 7
PTR_ST $s7, $sp, 7 << REG_LOG
.endif
.if \to >= 8
PTR_ST $s8, $sp, 8 << REG_LOG
.endif
#else
PTR_ST $s\()\from, $sp, \from << REG_LOG
.if \to - \from
push_regs %from + 1, \to
.endif
#endif
.endm // End push_regs

.macro pop_regs from, to
#ifdef __clang__
.if \to >= 0
PTR_LD $s0, $sp, 0 << REG_LOG
.endif
.if \to >= 1
PTR_LD $s1, $sp, 1 << REG_LOG
.endif
.if \to >= 2
PTR_LD $s2, $sp, 2 << REG_LOG
.endif
.if \to >= 3
PTR_LD $s3, $sp, 3 << REG_LOG
.endif
.if \to >= 4
PTR_LD $s4, $sp, 4 << REG_LOG
.endif
.if \to >= 5
PTR_LD $s5, $sp, 5 << REG_LOG
.endif
.if \to >= 6
PTR_LD $s6, $sp, 6 << REG_LOG
.endif
.if \to >= 7
PTR_LD $s7, $sp, 7 << REG_LOG
.endif
.if \to >= 8
PTR_LD $s8, $sp, 8 << REG_LOG
.endif
#else
PTR_LD $s\()\from, $sp, \from << REG_LOG
.if \to - \from
pop_regs %from + 1, \to
.endif
#endif
.endm // End pop_regs

.macro push_fregs from, to
#ifdef __clang__
.if \to >= 0
PTR_FST $fs0, $sp, 0 << FREG_LOG
.endif
.if \to >= 1
PTR_FST $fs1, $sp, 1 << FREG_LOG
.endif
.if \to >= 2
PTR_FST $fs2, $sp, 2 << FREG_LOG
.endif
.if \to >= 3
PTR_FST $fs3, $sp, 3 << FREG_LOG
.endif
.if \to >= 4
PTR_FST $fs4, $sp, 4 << FREG_LOG
.endif
.if \to >= 5
PTR_FST $fs5, $sp, 5 << FREG_LOG
.endif
.if \to >= 6
PTR_FST $fs6, $sp, 6 << FREG_LOG
.endif
.if \to >= 7
PTR_FST $fs7, $sp, 7 << FREG_LOG
.endif
#else
PTR_FST $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
push_fregs %from + 1, \to
.endif
#endif
.endm // End push_fregs

.macro pop_fregs from, to
#ifdef __clang__
.if \to >= 0
PTR_FLD $fs0, $sp, 0 << FREG_LOG
.endif
.if \to >= 1
PTR_FLD $fs1, $sp, 1 << FREG_LOG
.endif
.if \to >= 2
PTR_FLD $fs2, $sp, 2 << FREG_LOG
.endif
.if \to >= 3
PTR_FLD $fs3, $sp, 3 << FREG_LOG
.endif
.if \to >= 4
PTR_FLD $fs4, $sp, 4 << FREG_LOG
.endif
.if \to >= 5
PTR_FLD $fs5, $sp, 5 << FREG_LOG
.endif
.if \to >= 6
PTR_FLD $fs6, $sp, 6 << FREG_LOG
.endif
.if \to >= 7
PTR_FLD $fs7, $sp, 7 << FREG_LOG
.endif
#else
PTR_FLD $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
pop_fregs %from + 1, \to
.endif
#endif
.endm // End pop_fregs

//
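The #ifdef __clang__ branches above unroll what the #else branches express as recursion: stepping from $s0 to $sN by computing the next name with .altmacro %from + 1 arithmetic, which clang's integrated assembler does not evaluate. Whether a given toolchain needs the unrolled path can be probed the same way c_check probes SIMD support; a sketch (hypothetical file and macro names; cc stands in for the configured compiler):

    tmps="$(mktemp -d)/altmacro.S"
    cat > "$tmps" <<'EOF'
    .altmacro
    .macro probe from, to
        st.d $s\()\from, $sp, \from << 3
        .if \to - \from
            probe %from + 1, \to
        .endif
    .endm
    probe 0, 2
    EOF
    cc -march=loongarch64 -c "$tmps" -o /dev/null 2>/dev/null \
      && echo ".altmacro recursion OK (GNU as)" \
      || echo ".altmacro recursion unsupported (e.g. clang integrated assembler)"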
@@ -275,7 +393,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// GXOR
//
.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()xor.\suf_op \out, \in0, \in1
.ifnb \pre_op
\pre_op\()xor.v \out, \in0, \in1
.else
xor.\suf_op \out, \in0, \in1
.endif
.ifnb \more
GXOR \pre_op, \suf_op, \more
.endif
@@ -307,6 +429,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GPRELD \more
.endif
.endm
//
// GPACKEV
//
.macro GPACKEV pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()packev.\suf_op \out, \in0, \in1
.ifnb \more
GPACKEV \pre_op, \suf_op, \more
.endif
.endm
//
// GPACKOD
//
.macro GPACKOD pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()packod.\suf_op \out, \in0, \in1
.ifnb \more
GPACKOD \pre_op, \suf_op, \more
.endif
.endm
//
// GSHUF4I
//
.macro GSHUF4I pre_op:req, suf_op:req, out:req, in0:req, in1:req /* imm */, more:vararg
\pre_op\()shuf4i.\suf_op \out, \in0, \in1
.ifnb \more
GSHUF4I \pre_op, \suf_op, \more
.endif
.endm

.macro TRANSF2G name, pre_op:req, suf_op:req, more:vararg
.ifeqs "\pre_op\()\suf_op", "vfs"
\name v, w, \more
.endif
.ifeqs "\pre_op\()\suf_op", "vfd"
\name v, d, \more
.endif
.ifeqs "\pre_op\()\suf_op", "xvfs"
\name xv, w, \more
.endif
.ifeqs "\pre_op\()\suf_op", "xvfd"
\name xv, d, \more
.endif
.endm

//
// Compound instructions
@ -314,61 +478,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// GACC: Accumulate the values of vector registers
//
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
.ifeqs "\pre_op\()\suf_op", "xvfd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvfs"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif

.ifeqs "\pre_op", "vf"
.ifeqs "\pre_op\()\suf_op", "vfd"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vfs"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif

.ifeqs "\pre_op", "xv"
.ifeqs "\pre_op\()\suf_op", "xvd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvw"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvh"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "xvb"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
xvpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif

.ifeqs "\pre_op", "v"
.ifeqs "\pre_op\()\suf_op", "vd"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vw"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vh"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.ifeqs "\pre_op\()\suf_op", "vb"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
vpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif

.ifnb \more
GACC \pre_op, \suf_op, \more
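A usage sketch may help here. The rework selects the variant with one concatenated string compare ("\pre_op\()\suf_op", e.g. "xvfs") instead of nested .ifeqs tests, presumably to keep clang's integrated assembler happy. A hypothetical call site (register choice assumed, not taken from this commit) that horizontally sums eight packed floats:

    // Reduce $xr1 (8 x f32) into $xr0 via the "xvfs" branch above:
    // fold the high 128-bit half onto the low half (xvpermi.q + xvfadd.s),
    // then fold odd/even doublewords and words (xvpackod.d / xvpackod.w),
    // leaving the accumulated sum in the low element of $xr0.
    GACC xvf, s, $xr0, $xr1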
@ -391,26 +590,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Note: When "pre_op = xvf && suf_op = s", in will be modified.
//
.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
.ifeqs "\pre_op\()\suf_op", "xvfd"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \out, \out, \in
.endif

.ifeqs "\pre_op\()\suf_op", "xvfs"
xvpermi.q \out, \in, 0x01
.ifeqs "\suf_op", "s"
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.else
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif

.ifeqs "\pre_op", "vf"
.ifeqs "\suf_op", "s"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.else
.ifeqs "\pre_op\()\suf_op", "vfd"
vor.v \out, \in, \in
.endif
.endif

.ifeqs "\pre_op\()\suf_op", "vfs"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif

.ifnb \more
GCOMPLEXACC \pre_op, \suf_op, \more
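Likewise for the complex accumulator, a hypothetical LASX call site (registers assumed, not from this commit): for double-precision complex data each 128-bit half of the source holds one (re, im) pair, so the "xvfd" branch above needs only a single fold.

    // $xr1 holds two double-complex values; xvpermi.q moves the high
    // half down and one xvfadd.d leaves (re0+re1, im0+im1) in $xr0.
    GCOMPLEXACC xvf, d, $xr0, $xr1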
@ -430,56 +629,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// suf_op: s or d, differentiate between single precision or double precision complex numbers
//
.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
xvpackev.w \tmp0, \in0, \in0
.else
xvpackev.d \tmp0, \in0, \in0
.endif
.else
vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
vpackev.w \tmp0, \in0, \in0
.else
vpackev.d \tmp0, \in0, \in0
.endif
.endif
TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1
TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0

\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0

.ifeqs "\pre_op", "xvf"
.ifeqs "\xconj", "0"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
.else
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
.endif

.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.else
xvpackod.w \tmp1, \tmp1, \in0
.endif
xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \tmp1
.else
xvpackod.d \tmp1, \tmp1, \in0
.endif
xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \tmp1
.else
vpackod.w \tmp1, \tmp1, \in0
.endif
vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \tmp1
.else
vpackod.d \tmp1, \tmp1, \in0
.endif
vshuf4i.d \tmp2, \in1, 0x0b
.endif
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif

\pre_op\()mul.\suf_op \out, \tmp0, \in1
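The shuffles are easier to follow against the scalar identity being vectorized, and the rework routes them through the TRANSF2G dispatcher so one spelling covers both the v and xv forms. For each complex lane pair (a, b) of in0 and (c, d) of in1 the product is (a + bi)(c + di) = (ac - bd) + (ad + bc)i, assembled roughly as sketched below; the final fused multiply-add sits past this hunk's context, but the tail shown for GCOMPLEXMADD further down ends the same way:

    tmp0 = (a, a)        // GPACKEV: duplicate the real parts
    tmp1 = (-b, b)       // 0 - in0, then GPACKOD; xconj swaps the signs
    tmp2 = (d, c)        // GSHUF4I 0xb1 / 0x0b: swap re/im lanes of in1
    out  = tmp0 * in1    // (a*c, a*d)
    out += tmp1 * tmp2   // (a*c - b*d, a*d + b*c): the complex product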
@ -512,112 +676,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// suf_op: s or d, differentiate between single precision or double precision complex numbers
//
.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
xvpackev.w \tmp0, \in0, \in0
.else
xvpackev.d \tmp0, \in0, \in0
.endif
.else
vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
vpackev.w \tmp0, \in0, \in0
.else
vpackev.d \tmp0, \in0, \in0
.endif
.endif
TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1
TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0

\pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2
.ifeqs "\conj", "1"

.ifeqs "\conj\()\suf_op", "1s"
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
xvshuf4i.w \tmp0, \tmp0, 0xb1
xvpackev.w \out, \tmp0, \tmp2
.else
xvshuf4i.d \tmp0, \tmp0, 0x0b
xvpackev.d \out, \tmp0, \tmp2
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0xb1
TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2
.endif
.else
.ifeqs "\suf_op", "s"
vshuf4i.w \tmp0, \tmp0, 0xb1
vpackev.w \out, \tmp0, \tmp2
.else
vshuf4i.d \tmp0, \tmp0, 0x0b
vpackev.d \out, \tmp0, \tmp2
.ifeqs "\conj\()\suf_op", "1d"
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0x0b
TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2
.endif
.endif /* pre_op = xvf */
.else
.ifeqs "\conj", "0"
\pre_op\()add.\suf_op \out, \tmp2, \tmp1
.endif /* conj = 1 */
.endif

\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0

.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \tmp1
.else
xvpackod.w \tmp1, \tmp1, \in0
.ifeqs "\xconj\()\conj\()\suf_op", "00s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.else
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \in0
.else
xvpackod.w \tmp1, \tmp1, \tmp1
.ifeqs "\xconj\()\conj\()\suf_op", "10s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "01s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \tmp1
.else
xvpackod.d \tmp1, \tmp1, \in0
.ifeqs "\xconj\()\conj\()\suf_op", "11s"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
.endif
.else
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \in0
.else
xvpackod.d \tmp1, \tmp1, \tmp1
.ifeqs "\xconj\()\conj\()\suf_op", "00d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "10d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \tmp1
.else
vpackod.w \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \in0
.else
vpackod.w \tmp1, \tmp1, \tmp1
.endif
.endif
vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \tmp1
.else
vpackod.d \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \in0
.else
vpackod.d \tmp1, \tmp1, \tmp1
.endif
.endif
vshuf4i.d \tmp2, \in1, 0x0b
.ifeqs "\xconj\()\conj\()\suf_op", "01d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif
.ifeqs "\xconj\()\conj\()\suf_op", "11d"
TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
.endif

\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
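A hypothetical call site for the fused form (registers assumed, not taken from this commit); per complex lane it appears to compute out = in0 * in1 + in2, with xconj/conj selecting the four conjugation variants the complex kernels need:

    // LASX, single-precision complex, no conjugation:
    // out=$xr0, in0=$xr1, in1=$xr2, in2=$xr3, tmp0..tmp2=$xr4..$xr6.
    GCOMPLEXMADD 0, 0, xvf, s, $xr0, $xr1, $xr2, $xr3, $xr4, $xr5, $xr6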
@ -837,7 +837,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

PROLOGUE
push_if_used 26, 32
push_if_used 9, 8
xvreplve0.w VALPHA, $xr0
#if defined (TRMMKERNEL) && !defined(LEFT)
PTR_SUB OFF, ZERO, OFFSET
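The prologue/epilogue changes in this and all remaining hunks follow one pattern: arithmetic such as "17 + 7" disappears from the macro arguments, and the counts shrink to what looks like the number of callee-saved GPRs and FPRs that actually need spilling, written as plain literals. A minimal sketch of a spill macro in that style (name, frame layout, and register choice are illustrative assumptions, not this repository's actual push_if_used):

    .macro push_callee_saved gprs:req, fprs:req
        addi.d  $sp, $sp, -((\gprs + \fprs) * 8)  // one 8-byte slot each
    .if \gprs > 0
        st.d    $s0, $sp, 0                       // first callee-saved GPR
    .endif
    .if \gprs > 1
        st.d    $s1, $sp, 8
    .endif
    .if \fprs > 0
        fst.d   $fs0, $sp, (\gprs * 8)            // first callee-saved FPR
    .endif
    .endm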
@ -2343,6 +2343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif // #if defined(TRMMKERNEL)
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_N0

PROLOGUE
push_if_used 26, 32
push_if_used 9, 8

move TD, DST
move TS, SRC

@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI M, M, -1
blt ZERO, M, .L_N1_M1
.L_N0:
pop_if_used 26, 32
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE

@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_N0

PROLOGUE
push_if_used 17, 20
push_if_used 0, 0

move TD, DST
move TS, SRC

@ -293,6 +293,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI M, M, -1
blt ZERO, M, .L_N1_M1
.L_N0:
pop_if_used 17, 20
pop_if_used 0, 0
jirl $r0, $r1, 0x0
EPILOGUE

@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_M0

PROLOGUE
push_if_used 24, 8
push_if_used 7, 0

move S0, SRC
move P0, DST

@ -521,6 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI S1, S1, 0x04
PTR_ADDI P5, P5, 0x04
.L_M0:
pop_if_used 24, 8
pop_if_used 7, 0
jirl $r0, $r1, 0x00
EPILOGUE

@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//.L_M0

PROLOGUE
push_if_used 23, 8
push_if_used 6, 0

move S0, SRC
move P0, DST

@ -401,6 +401,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PTR_ADDI S1, S1, 0x04
PTR_ADDI P4, P4, 0x04
.L_M0:
pop_if_used 23, 8
pop_if_used 6, 0
jirl $r0, $r1, 0x00
EPILOGUE
@ -418,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 19
push_if_used 7, 0
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K

@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 19
pop_if_used 7, 0
jirl $r0, $r1, 0x0
EPILOGUE

@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 18
push_if_used 8, 0
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */

@ -400,6 +400,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
.L_END:
pop_if_used 17 + 8, 18
pop_if_used 8, 0
jirl $r0, $r1, 0x0
EPILOGUE

@ -253,7 +253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K

@ -291,6 +291,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
push_if_used 7, 7
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K

@ -337,7 +337,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
pop_if_used 7, 7
jirl $r0, $r1, 0x0
EPILOGUE

@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */

@ -263,6 +263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
ZGEMV_T_LSX GAP_1, X2_GAP
.L_END:
pop_if_used 17 + 8, 30
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE

@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
push_if_used 8, 6
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */

@ -294,6 +294,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L_GAP_1: /* if (incx != 1) */
ZGEMV_T_LASX GAP_1, X4_GAP
.L_END:
pop_if_used 17 + 8, 30
pop_if_used 8, 6
jirl $r0, $r1, 0x0
EPILOGUE