LoongArch64: Add sgemm_kernel
This commit is contained in:
parent
12ede72ab7
commit
553cc1372f
|
@ -11,9 +11,24 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|||
|
||||
DGEMVNKERNEL = dgemv_n_8_lasx.S
|
||||
DGEMVTKERNEL = dgemv_t_8_lasx.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x8_lasx.S
|
||||
SGEMMINCOPY = sgemm_ncopy_16_lasx.S
|
||||
SGEMMITCOPY = sgemm_tcopy_16_lasx.S
|
||||
SGEMMONCOPY = sgemm_ncopy_8_lasx.S
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_lasx.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
|
|
@ -36,6 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define PTR_ST st.d
|
||||
#define PTR_SLLI slli.d
|
||||
#define PTR_SRLI srli.d
|
||||
#define PTR_SRAI srai.d
|
||||
#define PTR_MUL mul.d
|
||||
#define PTR_ALSL alsl.d
|
||||
#else
|
||||
#define LA_REG int32_t
|
||||
|
@ -48,6 +50,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define PTR_ST st.w
|
||||
#define PTR_SLLI slli.w
|
||||
#define PTR_SRLI srli.w
|
||||
#define PTR_SRAI srai.w
|
||||
#define PTR_MUL mul.w
|
||||
#define PTR_ALSL alsl.w
|
||||
#endif
|
||||
|
||||
|
@ -218,6 +222,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endif
|
||||
.endm
|
||||
//
|
||||
// GSUB - emit one "<pre_op>sub.<suf_op> out, in0, in1" per operand triple.
//   pre_op : text pasted directly in front of "sub" (may be left empty)
//   suf_op : suffix pasted after "sub."
//   out, in0, in1 : operands of the first instruction
//   more   : optional further (out, in0, in1) triples, handled by recursion
//
.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
    \pre_op\()sub.\suf_op \out, \in0, \in1
.ifnb \more
    // Operands remain: recurse with the same prefix and suffix.
    GSUB \pre_op, \suf_op, \more
.endif
.endm
|
||||
//
|
||||
// GSLLI
|
||||
//
|
||||
.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
|
@ -244,6 +257,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
GXOR \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GPERMI - emit one "<pre_op>permi.<suf_op> out, in0, in1" per operand triple.
//   pre_op : text pasted directly in front of "permi" (required)
//   suf_op : suffix pasted after "permi."
//   out, in0, in1 : operands of the first instruction; the callers in this
//                   file pass an immediate selector as in1
//   more   : optional further (out, in0, in1) triples, handled by recursion
//
.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
    \pre_op\()permi.\suf_op \out, \in0, \in1
.ifnb \more
    // Operands remain: recurse with the same prefix and suffix.
    GPERMI \pre_op, \suf_op, \more
.endif
.endm
|
||||
//
|
||||
// GNMSUB - emit one "<pre_op>nmsub.<suf_op> out, in0, in1, in2" per operand
//          group.
//   pre_op : text pasted directly in front of "nmsub" (required)
//   suf_op : suffix pasted after "nmsub."
//   out, in0, in1, in2 : operands of the first instruction
//   more   : optional further (out, in0, in1, in2) groups, handled by
//            recursion
//
.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
    \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2
.ifnb \more
    // Operands remain: recurse with the same prefix and suffix.
    GNMSUB \pre_op, \suf_op, \more
.endif
.endm
|
||||
//
|
||||
// GPRELD - emit one "preld in0, in1, in2" per operand triple.
//   in0, in1, in2 : the three operands handed to preld unchanged
//   more          : optional further triples, handled by recursion
//
.macro GPRELD in0:req, in1:req, in2:req, more:vararg
    preld \in0, \in1, \in2
.ifnb \more
    // Operands remain: recurse on the rest of the list.
    GPRELD \more
.endif
.endm
|
||||
|
||||
//
|
||||
// Compound instructions
|
||||
|
@ -311,3 +351,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
GACC \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GMOV - register copy written as "<pre_op>or.v out, in, in", one
//        instruction per (out, in) pair.
//   pre_op : text pasted directly in front of "or.v" (required)
//   out    : destination register
//   in     : source register, used for both inputs of the OR
//   more   : optional further (out, in) pairs, handled by recursion
//
.macro GMOV pre_op:req, out:req, in:req, more:vararg
    \pre_op\()or.v \out, \in, \in
.ifnb \more
    // Pairs remain: recurse with the same prefix.
    GMOV \pre_op, \more
.endif
.endm
|
||||
|
||||
//
|
||||
// Media Related Macros
|
||||
//
|
||||
// GSBUTTERFLY - emit an ilvl / ilvh pair on the same two inputs.
//   out0 receives "<pre_op>ilvl.<suf_op> out0, in0, in1"
//   out1 receives "<pre_op>ilvh.<suf_op> out1, in0, in1"
// out0 is written before the second instruction reads in0 and in1, so out0
// should not name either input register.
.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1
    \pre_op\()ilvl.\suf_op \out0, \in0, \in1
    \pre_op\()ilvh.\suf_op \out1, \in0, \in1
.endm
|
||||
// GINTERLACE - emit a pickev / pickod pair on the same two inputs.
//   out0 receives "<pre_op>pickev.<suf_op> out0, in0, in1"
//   out1 receives "<pre_op>pickod.<suf_op> out1, in0, in1"
// out0 is written before the second instruction reads in0 and in1, so out0
// should not name either input register.
.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1
    \pre_op\()pickev.\suf_op \out0, \in0, \in1
    \pre_op\()pickod.\suf_op \out1, \in0, \in1
.endm
|
||||
|
||||
//
|
||||
// GTRANSPOSE4x4_D: transpose a 4x4 block of double-word elements held in
// four vectors. There is no pre_op parameter: the body is hard-wired to the
// xv forms, so 128-bit vector instructions are not supported.
//   in0..in3   : source rows
//   out0..out3 : transposed result
//   vt0, vt1   : scratch vectors, overwritten
// Outputs and scratch are written while inputs are still being read; the
// safe usage is to pass registers that are all distinct.
//
.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                       vt0, vt1
    // Interleave row pairs (0,1) and (2,3).
    GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0
    GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2

    // Copies needed because the permutes below update their destination
    // in place. The order of these three copies matters.
    GMOV xv, \out0, \vt0
    GMOV xv, \out2, \vt1
    GMOV xv, \vt1, \out3

    // Combine 128-bit halves into the final rows.
    GPERMI xv, q, \out0, \out2, 0x02
    GPERMI xv, q, \out2, \vt0, 0x31
    GPERMI xv, q, \out3, \out1, 0x31
    GPERMI xv, q, \out1, \vt1, 0x02
.endm
|
||||
|
||||
// GTRANSPOSE8x8_W: transpose an 8x8 block of word elements held in eight
// 256-bit vectors (xv forms only, no pre_op parameter).
//   out0..out7 : transposed result
//   in0..in7   : source rows
//   tmp0..tmp3 : scratch vectors, overwritten
// Scratch and outputs are written while later inputs are still unread, so
// the scratch and output registers must not name any input register.
.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \
                       in0, in1, in2, in3, in4, in5, in6, in7, \
                       tmp0, tmp1, tmp2, tmp3
    // Rows 0-3: two interleave stages, results parked in out0..out3.
    GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0
    GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1
    GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0
    GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2

    // Rows 4-7: same two stages, results parked in out4..out7.
    GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4
    GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5
    GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0
    GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2

    // Keep copies of out0..out3: the first four permutes overwrite them,
    // and the last four still need their original contents.
    GMOV xv, \tmp0, \out0
    GMOV xv, \tmp1, \out1
    GMOV xv, \tmp2, \out2
    GMOV xv, \tmp3, \out3

    // Merge 128-bit halves from the two row groups.
    GPERMI xv, q, \out0, \out4, 0x02
    GPERMI xv, q, \out1, \out5, 0x02
    GPERMI xv, q, \out2, \out6, 0x02
    GPERMI xv, q, \out3, \out7, 0x02
    GPERMI xv, q, \out4, \tmp0, 0x31
    GPERMI xv, q, \out5, \tmp1, 0x31
    GPERMI xv, q, \out6, \tmp2, 0x31
    GPERMI xv, q, \out7, \tmp3, 0x31
.endm
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,463 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/08/23 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*********************************************************************/
|
||||
|
||||
/* Function parameters */
#define M      $r4   // param 1: m
#define N      $r5   // param 2: n
#define SRC    $r6   // param 3: src
#define LDA    $r7   // param 4: lda
#define DST    $r8   // param 5: dst

/* Loop counters */
#define I      $r9
#define J      $r10

/* Source pointers S1..S16 ($r21 and $r22 are not used) */
#define S1     $r12
#define S2     $r13
#define S3     $r14
#define S4     $r15
#define S5     $r16
#define S6     $r17
#define S7     $r18
#define S8     $r19
#define S9     $r20
#define S10    $r23
#define S11    $r24
#define S12    $r25
#define S13    $r26
#define S14    $r27
#define S15    $r28
#define S16    $r29

/* Running destination / source pointers */
#define TD     $r30
#define TS     $r31

/* TL shares $r7 with LDA and T0 shares $r6 with SRC: writing TL or T0
   destroys the corresponding argument. */
#define TL     $r7
#define T0     $r6
#undef  ZERO
#define ZERO   $r0

/* Scalar floating-point registers */
#define F0     $f0
#define F1     $f1
#define F2     $f2
#define F3     $f3
#define F4     $f4
#define F5     $f5
#define F6     $f6
#define F7     $f7
/* LASX vectors */
#define U0     $xr0
#define U1     $xr1
#define U2     $xr2
#define U3     $xr3
#define U4     $xr4
#define U5     $xr5
#define U6     $xr6
#define U7     $xr7
#define U8     $xr8
#define U9     $xr9
#define U10    $xr10
#define U11    $xr11
#define U12    $xr12
#define U13    $xr13
#define U14    $xr14
#define U15    $xr15
#define D0     $xr16
#define D1     $xr17
#define D2     $xr18
#define D3     $xr19
#define D4     $xr20
#define D5     $xr21
#define D6     $xr22
#define D7     $xr23
#define D8     $xr24
#define D9     $xr25
#define D10    $xr26
#define D11    $xr27
#define D12    $xr28
#define D13    $xr29
#define D14    $xr30
#define D15    $xr31
|
||||
|
||||
// Loops outline
|
||||
//.L_N16 <-------------------
|
||||
//| .L_M8: |
|
||||
//| .L_M7: | Main Loop
|
||||
//| .L_M1: |
|
||||
//| .L_M0: ---------------
|
||||
//.L_N15:
|
||||
//.L_N8:
|
||||
//| .L_N8_M8:
|
||||
//| .L_N8_M7:
|
||||
//| .L_N8_M1:
|
||||
//.L_N7:
|
||||
//.L_N4:
|
||||
//| .L_N4_M4:
|
||||
//| .L_N4_M3:
|
||||
//| .L_N4_M1:
|
||||
//.L_N3:
|
||||
//.L_N2:
|
||||
//| .L_N2_M2:
|
||||
//| .L_N2_M1:
|
||||
//.L_N1:
|
||||
//| .L_N1_M1:
|
||||
//.L_N0
|
||||
|
||||
// ---------------------------------------------------------------------------
// Packing copy for single-precision data, up to 16 source vectors per group.
//
// In:   M   ($r4)  number of consecutive floats taken from each source vector
//       N   ($r5)  number of source vectors; consecutive vectors are LDA
//                  floats apart
//       SRC ($r6)  address of the first source vector
//       LDA ($r7)  distance between source vectors, in floats
//       DST ($r8)  destination, written strictly sequentially
//
// Source vectors are consumed in groups of 16, then 8, 4, 2 and 1 according
// to the bits of N. For a group of width W the output is M consecutive
// records of W floats; record i holds element i of every vector in the group.
//
// T0 shares $r6 with SRC and TL shares $r7 with LDA, so SRC and LDA are dead
// once the setup sequence has run. M is consumed as a counter in .L_N1_M1.
// push_if_used / pop_if_used are defined outside this file; they are assumed
// to preserve whatever callee-saved registers the counts imply (TODO confirm).
// ---------------------------------------------------------------------------
    PROLOGUE
    push_if_used 26, 32

    move         TD,    DST
    move         TS,    SRC
    PTR_SLLI     TL,    LDA,   0x02      // TL = LDA * 4 bytes
    PTR_SLLI     T0,    TL,    0x01      // T0 = 2 * TL (overwrites SRC)
    PTR_SRAI     J,     N,     0x04      // J  = N / 16
    beq          J,     ZERO,  .L_N15
    .align 5
.L_N16:
    // S1..S16 = the 16 source vectors of this group, TS = start of the next.
    move         S1,    TS
    PTR_ADD      S2,    TS,    TL
    PTR_SRAI     I,     M,     0x03      // I = M / 8
    PTR_ADD      S3,    S2,    TL
    PTR_ADDI     J,     J,     -1
    PTR_ADD      S4,    S3,    TL
    PTR_ADD      S5,    S3,    T0
    PTR_ADD      S6,    S4,    T0
    PTR_ADD      S7,    S5,    T0
    PTR_ADD      S8,    S6,    T0
    PTR_ADD      S9,    S7,    T0
    PTR_ADD      S10,   S8,    T0
    PTR_ADD      S11,   S9,    T0
    PTR_ADD      S12,   S10,   T0
    PTR_ADD      S13,   S11,   T0
    PTR_ADD      S14,   S12,   T0
    PTR_ADD      S15,   S13,   T0
    PTR_ADD      S16,   S14,   T0
    PTR_ADD      TS,    S15,   T0
    beq          I,     ZERO,  .L_M7
    .align 5
.L_M8:
    // 8 floats from each of the 16 vectors.
    xvld         U0,    S1,    0x00
    xvld         U1,    S2,    0x00
    xvld         U2,    S3,    0x00
    xvld         U3,    S4,    0x00
    xvld         U4,    S5,    0x00
    xvld         U5,    S6,    0x00
    xvld         U6,    S7,    0x00
    xvld         U7,    S8,    0x00
    xvld         U8,    S9,    0x00
    xvld         U9,    S10,   0x00
    xvld         U10,   S11,   0x00
    xvld         U11,   S12,   0x00
    xvld         U12,   S13,   0x00
    xvld         U13,   S14,   0x00
    xvld         U14,   S15,   0x00
    xvld         U15,   S16,   0x00

    // Even D registers: vectors 1-8. Odd D registers: vectors 9-16.
    // U0..U3 are free to serve as scratch for the second transpose.
    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
                    U0, U1, U2, U3, U4, U5, U6, U7, \
                    D1, D3, D5, D7 // As tmp
    GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \
                    U8, U9, U10, U11, U12, U13, U14, U15, \
                    U0, U1, U2, U3 // As tmp
    // Eight 16-float records, 0x200 bytes in total.
    GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \
              D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0
    PTR_ADDI     TD,    TD,    0x100
    GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \
              D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0
    PTR_ADDI     TD,    TD,    0x100
    PTR_ADDI     S1,    S1,    0x20
    PTR_ADDI     S2,    S2,    0x20
    PTR_ADDI     S3,    S3,    0x20
    PTR_ADDI     S4,    S4,    0x20
    PTR_ADDI     S5,    S5,    0x20
    PTR_ADDI     S6,    S6,    0x20
    PTR_ADDI     S7,    S7,    0x20
    PTR_ADDI     S8,    S8,    0x20
    PTR_ADDI     S9,    S9,    0x20
    PTR_ADDI     S10,   S10,   0x20
    PTR_ADDI     S11,   S11,   0x20
    PTR_ADDI     S12,   S12,   0x20
    PTR_ADDI     S13,   S13,   0x20
    PTR_ADDI     S14,   S14,   0x20
    PTR_ADDI     S15,   S15,   0x20
    PTR_ADDI     S16,   S16,   0x20

    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_M8
.L_M7:
    andi         I,     M,     0x07      // leftover elements, one per pass
    beq          I,     ZERO,  .L_M0
    .align 5
.L_M1:
    // One 16-float record, written as two halves of 8.
    fld.s        F0,    S1,    0x00
    fld.s        F1,    S2,    0x00
    fld.s        F2,    S3,    0x00
    fld.s        F3,    S4,    0x00
    fld.s        F4,    S5,    0x00
    fld.s        F5,    S6,    0x00
    fld.s        F6,    S7,    0x00
    fld.s        F7,    S8,    0x00

    fst.s        F0,    TD,    0x00
    fst.s        F1,    TD,    0x04
    fst.s        F2,    TD,    0x08
    fst.s        F3,    TD,    0x0C
    fst.s        F4,    TD,    0x10
    fst.s        F5,    TD,    0x14
    fst.s        F6,    TD,    0x18
    fst.s        F7,    TD,    0x1C

    PTR_ADDI     S1,    S1,    0x04
    PTR_ADDI     S2,    S2,    0x04
    PTR_ADDI     S3,    S3,    0x04
    PTR_ADDI     S4,    S4,    0x04
    PTR_ADDI     S5,    S5,    0x04
    PTR_ADDI     S6,    S6,    0x04
    PTR_ADDI     S7,    S7,    0x04
    PTR_ADDI     S8,    S8,    0x04
    PTR_ADDI     TD,    TD,    0x20

    fld.s        F0,    S9,    0x00
    fld.s        F1,    S10,   0x00
    fld.s        F2,    S11,   0x00
    fld.s        F3,    S12,   0x00
    fld.s        F4,    S13,   0x00
    fld.s        F5,    S14,   0x00
    fld.s        F6,    S15,   0x00
    fld.s        F7,    S16,   0x00

    fst.s        F0,    TD,    0x00
    fst.s        F1,    TD,    0x04
    fst.s        F2,    TD,    0x08
    fst.s        F3,    TD,    0x0C
    fst.s        F4,    TD,    0x10
    fst.s        F5,    TD,    0x14
    fst.s        F6,    TD,    0x18
    fst.s        F7,    TD,    0x1C

    PTR_ADDI     S9,    S9,    0x04
    PTR_ADDI     S10,   S10,   0x04
    PTR_ADDI     S11,   S11,   0x04
    PTR_ADDI     S12,   S12,   0x04
    PTR_ADDI     S13,   S13,   0x04
    PTR_ADDI     S14,   S14,   0x04
    PTR_ADDI     S15,   S15,   0x04
    PTR_ADDI     S16,   S16,   0x04
    PTR_ADDI     TD,    TD,    0x20

    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_M1
.L_M0:
    blt          ZERO,  J,     .L_N16
.L_N15:
    andi         J,     N,     0x0f
    beq          ZERO,  J,     .L_N0

    andi         J,     N,     0x08
    beq          ZERO,  J,     .L_N7
.L_N8:
    // Group of 8 vectors.
    move         S1,    TS
    PTR_ADD      S2,    TS,    TL
    PTR_SRAI     I,     M,     0x03
    PTR_ADD      S3,    S2,    TL
    PTR_ADD      S4,    S2,    T0
    PTR_ADD      S5,    S3,    T0
    PTR_ADD      S6,    S4,    T0
    PTR_ADD      S7,    S5,    T0
    PTR_ADD      S8,    S6,    T0
    PTR_ADD      TS,    S7,    T0
    beq          I,     ZERO,  .L_N8_M7
    .align 5
.L_N8_M8:
    xvld         U0,    S1,    0x00
    xvld         U1,    S2,    0x00
    xvld         U2,    S3,    0x00
    xvld         U3,    S4,    0x00
    xvld         U4,    S5,    0x00
    xvld         U5,    S6,    0x00
    xvld         U6,    S7,    0x00
    xvld         U7,    S8,    0x00

    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
                    U0, U1, U2, U3, U4, U5, U6, U7, \
                    D1, D3, D5, D7 // As tmp
    GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \
              D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
    PTR_ADDI     TD,    TD,    0x100
    PTR_ADDI     S1,    S1,    0x20
    PTR_ADDI     S2,    S2,    0x20
    PTR_ADDI     S3,    S3,    0x20
    PTR_ADDI     S4,    S4,    0x20
    PTR_ADDI     S5,    S5,    0x20
    PTR_ADDI     S6,    S6,    0x20
    PTR_ADDI     S7,    S7,    0x20
    PTR_ADDI     S8,    S8,    0x20

    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_N8_M8
.L_N8_M7:
    andi         I,     M,     0x07
    beq          I,     ZERO,  .L_N7
    .align 5
.L_N8_M1:
    fld.s        F0,    S1,    0x00
    fld.s        F1,    S2,    0x00
    fld.s        F2,    S3,    0x00
    fld.s        F3,    S4,    0x00
    fld.s        F4,    S5,    0x00
    fld.s        F5,    S6,    0x00
    fld.s        F6,    S7,    0x00
    fld.s        F7,    S8,    0x00

    fst.s        F0,    TD,    0x00
    PTR_ADDI     S1,    S1,    0x04
    fst.s        F1,    TD,    0x04
    PTR_ADDI     S2,    S2,    0x04
    fst.s        F2,    TD,    0x08
    PTR_ADDI     S3,    S3,    0x04
    fst.s        F3,    TD,    0x0C
    PTR_ADDI     S4,    S4,    0x04
    fst.s        F4,    TD,    0x10
    PTR_ADDI     S5,    S5,    0x04
    fst.s        F5,    TD,    0x14
    PTR_ADDI     S6,    S6,    0x04
    fst.s        F6,    TD,    0x18
    PTR_ADDI     S7,    S7,    0x04
    fst.s        F7,    TD,    0x1C
    PTR_ADDI     S8,    S8,    0x04

    PTR_ADDI     TD,    TD,    0x20
    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_N8_M1
.L_N7:
    andi         J,     N,     0x07
    beq          ZERO,  J,     .L_N0

    andi         J,     N,     0x04
    beq          ZERO,  J,     .L_N3
.L_N4:
    // Group of 4 vectors.
    move         S1,    TS
    PTR_ADD      S2,    TS,    TL
    PTR_SRAI     I,     M,     0x02
    PTR_ADD      S3,    S2,    TL
    PTR_ADD      S4,    S2,    T0
    PTR_ADD      TS,    S3,    T0
    beq          I,     ZERO,  .L_N4_M3
    .align 5
.L_N4_M4:
    // 4x4 word transpose using 128-bit interleaves.
    GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
    GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
    GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
    GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
    GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
    GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
    PTR_ADDI     S1,    S1,    0x10
    PTR_ADDI     S2,    S2,    0x10
    PTR_ADDI     S3,    S3,    0x10
    PTR_ADDI     S4,    S4,    0x10
    PTR_ADDI     TD,    TD,    0x40
    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_N4_M4
.L_N4_M3:
    andi         I,     M,     0x03
    beq          I,     ZERO,  .L_N3
    .align 5
.L_N4_M1:
    fld.s        F0,    S1,    0x00
    fld.s        F1,    S2,    0x00
    fld.s        F2,    S3,    0x00
    fld.s        F3,    S4,    0x00

    fst.s        F0,    TD,    0x00
    PTR_ADDI     S1,    S1,    0x04
    fst.s        F1,    TD,    0x04
    PTR_ADDI     S2,    S2,    0x04
    fst.s        F2,    TD,    0x08
    PTR_ADDI     S3,    S3,    0x04
    fst.s        F3,    TD,    0x0C
    PTR_ADDI     S4,    S4,    0x04

    PTR_ADDI     TD,    TD,    0x10
    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_N4_M1
.L_N3:
    andi         J,     N,     0x03
    beq          ZERO,  J,     .L_N0

    andi         J,     N,     0x02
    beq          ZERO,  J,     .L_N1
.L_N2:
    // Group of 2 vectors.
    move         S1,    TS
    PTR_ADD      S2,    TS,    TL
    PTR_SRAI     I,     M,     0x01
    PTR_ADD      TS,    S2,    TL
    beq          I,     ZERO,  .L_N2_M1
    .align 5
.L_N2_M2:
    // fld.d brings two floats into F0 / F1; the interleave then reads the
    // same registers under their $vr0 / $vr1 names.
    GLD f, d, F0, S1, 0x00, F1, S2, 0x00
    vilvl.w      $vr0,  $vr1,  $vr0
    GST v, , $vr0, TD, 0x00
    PTR_ADDI     S1,    S1,    0x08
    PTR_ADDI     S2,    S2,    0x08
    PTR_ADDI     TD,    TD,    0x10

    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_N2_M2
.L_N2_M1:
    andi         I,     M,     0x01
    beq          I,     ZERO,  .L_N1

    fld.s        F0,    S1,    0x00
    fld.s        F1,    S2,    0x00

    fst.s        F0,    TD,    0x00
    PTR_ADDI     S1,    S1,    0x04
    fst.s        F1,    TD,    0x04
    PTR_ADDI     S2,    S2,    0x04
    PTR_ADDI     TD,    TD,    0x08
    .align 5
.L_N1:
    // This label is reached both by the branch in .L_N3 and by falling
    // through from the two-vector group, so bit 0 of N has to be tested
    // here. Without the test an even N with N % 4 == 2 would copy M floats
    // from a vector that does not exist and write them past the output.
    andi         J,     N,     0x01
    beq          ZERO,  J,     .L_N0
    move         S1,    TS
    beq          ZERO,  M,     .L_N0
.L_N1_M1:
    // Last single vector: plain element-by-element copy, M is the counter.
    fld.s        F0,    S1,    0x00
    PTR_ADDI     S1,    S1,    0x04
    fst.s        F0,    TD,    0x00
    PTR_ADDI     TD,    TD,    0x04
    PTR_ADDI     M,     M,     -1
    blt          ZERO,  M,     .L_N1_M1
.L_N0:
    pop_if_used 26, 32
    jirl         $r0,   $r1,   0x0
    EPILOGUE
|
|
@ -0,0 +1,298 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/08/23 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*********************************************************************/
|
||||
|
||||
/* Function parameters */
#define M      $r4   // param 1: m
#define N      $r5   // param 2: n
#define SRC    $r6   // param 3: src
#define LDA    $r7   // param 4: lda
#define DST    $r8   // param 5: dst

/* Loop counters */
#define I      $r9
#define J      $r10

/* Source pointers S1..S8 */
#define S1     $r12
#define S2     $r13
#define S3     $r14
#define S4     $r15
#define S5     $r16
#define S6     $r17
#define S7     $r18
#define S8     $r19

/* Running destination / source pointers */
#define TD     $r20
#define TS     $r11

/* TL shares $r7 with LDA and T0 shares $r6 with SRC: writing TL or T0
   destroys the corresponding argument. */
#define TL     $r7
#define T0     $r6
#undef  ZERO
#define ZERO   $r0

/* Scalar floating-point registers */
#define F0     $f0
#define F1     $f1
#define F2     $f2
#define F3     $f3
#define F4     $f4
#define F5     $f5
#define F6     $f6
#define F7     $f7
/* LASX vectors */
#define U0     $xr0
#define U1     $xr1
#define U2     $xr2
#define U3     $xr3
#define U4     $xr4
#define U5     $xr5
#define U6     $xr6
#define U7     $xr7
#define D0     $xr8
#define D1     $xr9
#define D2     $xr10
#define D3     $xr11
#define D4     $xr12
#define D5     $xr13
#define D6     $xr14
#define D7     $xr15
/* Only the even-numbered names exist from D8 upwards. */
#define D8     $xr16
#define D10    $xr17
#define D12    $xr18
#define D14    $xr19
|
||||
|
||||
// Loops outline
|
||||
//.L_N8: <----------------
|
||||
//| .L_M8: |
|
||||
//| .L_M7: | Main Loop
|
||||
//| .L_M1: |
|
||||
//| .L_M0:--------------
|
||||
//.L_N7:
|
||||
//.L_N4:
|
||||
//| .L_N4_M4:
|
||||
//| .L_N4_M3:
|
||||
//| .L_N4_M1:
|
||||
//.L_N3:
|
||||
//.L_N2:
|
||||
//| .L_N2_M2:
|
||||
//| .L_N2_M1:
|
||||
//.L_N1:
|
||||
//| .L_N1_M1:
|
||||
//.L_N0
|
||||
|
||||
// ---------------------------------------------------------------------------
// Packing copy for single-precision data, up to 8 source vectors per group.
//
// In:   M   ($r4)  number of consecutive floats taken from each source vector
//       N   ($r5)  number of source vectors; consecutive vectors are LDA
//                  floats apart
//       SRC ($r6)  address of the first source vector
//       LDA ($r7)  distance between source vectors, in floats
//       DST ($r8)  destination, written strictly sequentially
//
// Source vectors are consumed in groups of 8, then 4, 2 and 1 according to
// the bits of N. For a group of width W the output is M consecutive records
// of W floats; record i holds element i of every vector in the group.
//
// T0 shares $r6 with SRC and TL shares $r7 with LDA, so SRC and LDA are dead
// once the setup sequence has run. M is consumed as a counter in .L_N1_M1.
// push_if_used / pop_if_used are defined outside this file; they are assumed
// to preserve whatever callee-saved registers the counts imply (TODO confirm).
// ---------------------------------------------------------------------------
    PROLOGUE
    push_if_used 17, 20

    move         TD,    DST
    move         TS,    SRC
    PTR_SLLI     TL,    LDA,   0x02      // TL = LDA * 4 bytes
    PTR_SLLI     T0,    TL,    0x01      // T0 = 2 * TL (overwrites SRC)
    PTR_SRAI     J,     N,     0x03      // J  = N / 8
    beq          J,     ZERO,  .L_N7
    .align 5
.L_N8:
    // S1..S8 = the 8 source vectors of this group, TS = start of the next.
    move         S1,    TS
    PTR_ADD      S2,    TS,    TL
    PTR_SRAI     I,     M,     0x03      // I = M / 8
    PTR_ADD      S3,    S2,    TL
    PTR_ADDI     J,     J,     -1
    PTR_ADD      S4,    S2,    T0
    PTR_ADD      S5,    S3,    T0
    PTR_ADD      S6,    S4,    T0
    PTR_ADD      S7,    S5,    T0
    PTR_ADD      S8,    S6,    T0
    PTR_ADD      TS,    S7,    T0
    beq          I,     ZERO,  .L_M7
    .align 5
.L_M8:
    // 8 floats from each of the 8 vectors, transposed into 8 records.
    xvld         U0,    S1,    0x00
    xvld         U1,    S2,    0x00
    xvld         U2,    S3,    0x00
    xvld         U3,    S4,    0x00
    xvld         U4,    S5,    0x00
    xvld         U5,    S6,    0x00
    xvld         U6,    S7,    0x00
    xvld         U7,    S8,    0x00

    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
                    U0, U1, U2, U3, U4, U5, U6, U7, \
                    D1, D3, D5, D7 // As tmp
    GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \
              D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
    PTR_ADDI     TD,    TD,    0x100
    PTR_ADDI     S1,    S1,    0x20
    PTR_ADDI     S2,    S2,    0x20
    PTR_ADDI     S3,    S3,    0x20
    PTR_ADDI     S4,    S4,    0x20
    PTR_ADDI     S5,    S5,    0x20
    PTR_ADDI     S6,    S6,    0x20
    PTR_ADDI     S7,    S7,    0x20
    PTR_ADDI     S8,    S8,    0x20
    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_M8
.L_M7:
    andi         I,     M,     0x07      // leftover elements, one per pass
    beq          I,     ZERO,  .L_M0
    .align 5
.L_M1:
    fld.s        F0,    S1,    0x00
    fld.s        F1,    S2,    0x00
    fld.s        F2,    S3,    0x00
    fld.s        F3,    S4,    0x00
    fld.s        F4,    S5,    0x00
    fld.s        F5,    S6,    0x00
    fld.s        F6,    S7,    0x00
    fld.s        F7,    S8,    0x00

    fst.s        F0,    TD,    0x00
    PTR_ADDI     S1,    S1,    0x04
    fst.s        F1,    TD,    0x04
    PTR_ADDI     S2,    S2,    0x04
    fst.s        F2,    TD,    0x08
    PTR_ADDI     S3,    S3,    0x04
    fst.s        F3,    TD,    0x0C
    PTR_ADDI     S4,    S4,    0x04
    fst.s        F4,    TD,    0x10
    PTR_ADDI     S5,    S5,    0x04
    fst.s        F5,    TD,    0x14
    PTR_ADDI     S6,    S6,    0x04
    fst.s        F6,    TD,    0x18
    PTR_ADDI     S7,    S7,    0x04
    fst.s        F7,    TD,    0x1C
    PTR_ADDI     S8,    S8,    0x04

    PTR_ADDI     TD,    TD,    0x20
    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_M1
.L_M0:
    blt          ZERO,  J,     .L_N8
.L_N7:
    andi         J,     N,     0x07
    beq          ZERO,  J,     .L_N0

    andi         J,     N,     0x04
    beq          ZERO,  J,     .L_N3
.L_N4:
    // Group of 4 vectors.
    move         S1,    TS
    PTR_ADD      S2,    TS,    TL
    PTR_SRAI     I,     M,     0x02
    PTR_ADD      S3,    S2,    TL
    PTR_ADD      S4,    S2,    T0
    PTR_ADD      TS,    S3,    T0
    beq          I,     ZERO,  .L_N4_M3
    .align 5
.L_N4_M4:
    // 4x4 word transpose using 128-bit interleaves.
    GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
    GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
    GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
    GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
    GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
    GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
    PTR_ADDI     S1,    S1,    0x10
    PTR_ADDI     S2,    S2,    0x10
    PTR_ADDI     S3,    S3,    0x10
    PTR_ADDI     S4,    S4,    0x10
    PTR_ADDI     TD,    TD,    0x40
    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_N4_M4
.L_N4_M3:
    andi         I,     M,     0x03
    beq          I,     ZERO,  .L_N3
    .align 5
.L_N4_M1:
    fld.s        F0,    S1,    0x00
    fld.s        F1,    S2,    0x00
    fld.s        F2,    S3,    0x00
    fld.s        F3,    S4,    0x00

    fst.s        F0,    TD,    0x00
    PTR_ADDI     S1,    S1,    0x04
    fst.s        F1,    TD,    0x04
    PTR_ADDI     S2,    S2,    0x04
    fst.s        F2,    TD,    0x08
    PTR_ADDI     S3,    S3,    0x04
    fst.s        F3,    TD,    0x0C
    PTR_ADDI     S4,    S4,    0x04

    PTR_ADDI     TD,    TD,    0x10
    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_N4_M1
.L_N3:
    andi         J,     N,     0x03
    beq          ZERO,  J,     .L_N0

    andi         J,     N,     0x02
    beq          ZERO,  J,     .L_N1
.L_N2:
    // Group of 2 vectors.
    move         S1,    TS
    PTR_ADD      S2,    TS,    TL
    PTR_SRAI     I,     M,     0x01
    PTR_ADD      TS,    S2,    TL
    beq          I,     ZERO,  .L_N2_M1
    .align 5
.L_N2_M2:
    // fld.d brings two floats into F0 / F1; the interleave then reads the
    // same registers under their $vr0 / $vr1 names.
    GLD f, d, F0, S1, 0x00, F1, S2, 0x00
    vilvl.w      $vr0,  $vr1,  $vr0
    GST v, , $vr0, TD, 0x00
    PTR_ADDI     S1,    S1,    0x08
    PTR_ADDI     S2,    S2,    0x08
    PTR_ADDI     TD,    TD,    0x10

    PTR_ADDI     I,     I,     -1
    blt          ZERO,  I,     .L_N2_M2
.L_N2_M1:
    andi         I,     M,     0x01
    beq          I,     ZERO,  .L_N1

    fld.s        F0,    S1,    0x00
    fld.s        F1,    S2,    0x00

    fst.s        F0,    TD,    0x00
    PTR_ADDI     S1,    S1,    0x04
    fst.s        F1,    TD,    0x04
    PTR_ADDI     S2,    S2,    0x04
    PTR_ADDI     TD,    TD,    0x08
    .align 5
.L_N1:
    // This label is reached both by the branch in .L_N3 and by falling
    // through from the two-vector group, so bit 0 of N has to be tested
    // here. Without the test an even N with N % 4 == 2 would copy M floats
    // from a vector that does not exist and write them past the output.
    andi         J,     N,     0x01
    beq          ZERO,  J,     .L_N0
    move         S1,    TS
    beq          ZERO,  M,     .L_N0
.L_N1_M1:
    // Last single vector: plain element-by-element copy, M is the counter.
    fld.s        F0,    S1,    0x00
    PTR_ADDI     S1,    S1,    0x04
    fst.s        F0,    TD,    0x00
    PTR_ADDI     TD,    TD,    0x04
    PTR_ADDI     M,     M,     -1
    blt          ZERO,  M,     .L_N1_M1
.L_N0:
    pop_if_used 17, 20
    jirl         $r0,   $r1,   0x0
    EPILOGUE
|
|
@ -0,0 +1,526 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/08/23 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*********************************************************************/
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S0 $r11
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define P0 $r20
|
||||
#define P1 $r23
|
||||
#define P2 $r24
|
||||
#define P3 $r25
|
||||
#define P4 $r26
|
||||
#define P5 $r27
|
||||
#define T0 $r28
|
||||
#define T1 $r29
|
||||
#define TL $r7
|
||||
#define ZERO $r0
|
||||
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
|
||||
// Loops outline
|
||||
//.L_M8 <-------------------
|
||||
//| .L_N16: |
|
||||
//| .L_N15: |
|
||||
//| .L_N8: |
|
||||
//| .L_N7: | Main Loop
|
||||
//| .L_N4: |
|
||||
//| .L_N3: |
|
||||
//| .L_N2: |
|
||||
//| .L_N1: |
|
||||
//| .L_N0: ---------------
|
||||
//.L_M7
|
||||
//.L_M4
|
||||
//| .L_M4_N16:
|
||||
//| .L_M4_N15:
|
||||
//| .L_M4_N8:
|
||||
//| .L_M4_N7:
|
||||
//| .L_M4_N4:
|
||||
//| .L_M4_N3:
|
||||
//| .L_M4_N2:
|
||||
//| .L_M4_N1:
|
||||
//.L_M3
|
||||
//.L_M2
|
||||
//| .L_M2_N16:
|
||||
//| .L_M2_N15:
|
||||
//| .L_M2_N8:
|
||||
//| .L_M2_N7:
|
||||
//| .L_M2_N4:
|
||||
//| .L_M2_N3:
|
||||
//| .L_M2_N2:
|
||||
//| .L_M2_N1:
|
||||
//.L_M1
|
||||
//| .L_M1_N16:
|
||||
//| .L_M1_N15:
|
||||
//| .L_M1_N8:
|
||||
//| .L_M1_N7:
|
||||
//| .L_M1_N4:
|
||||
//| .L_M1_N3:
|
||||
//| .L_M1_N2:
|
||||
//| .L_M1_N1:
|
||||
//.L_M0
|
||||
|
||||
// Entry and setup for the 16-wide copy.
//   P0 = DST                          cursor for the 16-wide panels
//   P2 = DST + 4 * m * (n & ~15)      start of the 8-wide tail panel
//   P3 = DST + 4 * m * (n & ~7)       start of the 4-wide tail panel
//   P4 = DST + 4 * m * (n & ~3)       start of the 2-wide tail panel
//   P5 = DST + 4 * m * (n & ~1)       start of the 1-wide tail panel
// NOTE(review): push_if_used comes from the included asm header; it
// presumably spills the callee-saved registers used here - confirm there.
    PROLOGUE
    push_if_used 24, 8

    move      P0, DST
    move      S0, SRC

    // P2: n rounded down to a multiple of 16
    PTR_SRAI  T0, N, 0x04
    PTR_SLLI  T0, T0, 0x04
    PTR_MUL   P2, M, T0
    PTR_SLLI  P2, P2, 0x02
    PTR_ADD   P2, DST, P2

    // P3: n rounded down to a multiple of 8
    PTR_SRAI  T1, N, 0x03
    PTR_SLLI  T1, T1, 0x03
    PTR_MUL   P3, M, T1
    PTR_SLLI  P3, P3, 0x02
    PTR_ADD   P3, DST, P3

    // P4: n rounded down to a multiple of 4
    PTR_SRAI  T0, N, 0x02
    PTR_SLLI  T0, T0, 0x02
    PTR_MUL   P4, M, T0
    PTR_SLLI  P4, P4, 0x02
    PTR_ADD   P4, DST, P4

    // P5: n rounded down to a multiple of 2
    PTR_SRAI  T1, N, 0x01
    PTR_SLLI  T1, T1, 0x01
    PTR_MUL   P5, M, T1
    PTR_SLLI  P5, P5, 0x02
    PTR_ADD   P5, DST, P5

    // Strides used by every section below.
    PTR_SLLI  TL, LDA, 0x02         // lda * 4: byte distance between source lines
    PTR_SLLI  T0, TL, 0x01          // two source lines, in bytes
    PTR_SLLI  T1, M, 0x06           // m * 64: byte distance between 16-wide panels

    PTR_SRAI  J, M, 0x03            // number of full eight-line groups
    beq       J, ZERO, .L_M7
|
||||
// Main loop: each pass packs one group of eight lda-strided source lines.
// Full 16-element chunks go to the 16-wide panels through P1 (stepping by T1
// per chunk); the n & 8 / 4 / 2 / 1 leftovers go to the tail panels P2..P5.
    .align 5
.L_M8:
    // Odd lines chain from S1 and even lines from S2, each step 2 * lda bytes.
    move      S1, S0
    PTR_ADD   S2, S0, TL
    PTR_ADD   S3, S1, T0
    PTR_ADD   S5, S3, T0
    PTR_ADD   S7, S5, T0
    PTR_ADD   S4, S2, T0
    PTR_ADD   S6, S4, T0
    PTR_ADD   S8, S6, T0
    PTR_ADD   S0, S7, T0            // S0 now addresses the following group

    move      P1, P0
    PTR_ADDI  P0, P0, 0x200         // this group takes 8 * 64 bytes in each 16-wide panel
    PTR_ADDI  J, J, -1
    PTR_SRAI  I, N, 0x04            // number of full 16-element chunks
    beq       I, ZERO, .L_N15
.L_N16:
    // 16 floats from each of the eight lines, two lines at a time.
    GLD xv, , U0, S1, 0x00, U1, S1, 0x20, U2, S2, 0x00, U3, S2, 0x20
    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60
    GLD xv, , U4, S3, 0x00, U5, S3, 0x20, U6, S4, 0x00, U7, S4, 0x20
    GST xv, , U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0
    GLD xv, , U0, S5, 0x00, U1, S5, 0x20, U2, S6, 0x00, U3, S6, 0x20
    GST xv, , U0, P1, 0x100, U1, P1, 0x120, U2, P1, 0x140, U3, P1, 0x160
    GLD xv, , U4, S7, 0x00, U5, S7, 0x20, U6, S8, 0x00, U7, S8, 0x20
    GST xv, , U4, P1, 0x180, U5, P1, 0x1A0, U6, P1, 0x1C0, U7, P1, 0x1E0

    PTR_ADD   P1, P1, T1            // same group slot in the next 16-wide panel
    PTR_ADDI  I, I, -1
    PTR_ADDI  S1, S1, 0x40
    PTR_ADDI  S2, S2, 0x40
    PTR_ADDI  S3, S3, 0x40
    PTR_ADDI  S4, S4, 0x40
    PTR_ADDI  S5, S5, 0x40
    PTR_ADDI  S6, S6, 0x40
    PTR_ADDI  S7, S7, 0x40
    PTR_ADDI  S8, S8, 0x40
    blt       ZERO, I, .L_N16
.L_N15:
    andi      I, N, 0x08
    beq       I, ZERO, .L_N7
.L_N8:
    // 8 floats per line into the 8-wide tail panel.
    GLD xv, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    GLD xv, , U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00
    xvst      U0, P2, 0x00
    xvst      U1, P2, 0x20
    xvst      U2, P2, 0x40
    xvst      U3, P2, 0x60
    xvst      U4, P2, 0x80
    xvst      U5, P2, 0xA0
    xvst      U6, P2, 0xC0
    xvst      U7, P2, 0xE0
    PTR_ADDI  P2, P2, 0x100
    PTR_ADDI  S1, S1, 0x20
    PTR_ADDI  S2, S2, 0x20
    PTR_ADDI  S3, S3, 0x20
    PTR_ADDI  S4, S4, 0x20
    PTR_ADDI  S5, S5, 0x20
    PTR_ADDI  S6, S6, 0x20
    PTR_ADDI  S7, S7, 0x20
    PTR_ADDI  S8, S8, 0x20
.L_N7:
    andi      I, N, 0x04
    beq       I, ZERO, .L_N3
.L_N4:
    // 4 floats per line into the 4-wide tail panel.
    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
    GLD v, , $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30
    GST v, , $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70
    PTR_ADDI  P3, P3, 0x80
    PTR_ADDI  S1, S1, 0x10
    PTR_ADDI  S2, S2, 0x10
    PTR_ADDI  S3, S3, 0x10
    PTR_ADDI  S4, S4, 0x10
    PTR_ADDI  S5, S5, 0x10
    PTR_ADDI  S6, S6, 0x10
    PTR_ADDI  S7, S7, 0x10
    PTR_ADDI  S8, S8, 0x10
.L_N3:
    andi      I, N, 0x02
    beq       I, ZERO, .L_N1
.L_N2:
    // 2 floats per line, moved as one 8-byte quantity, into the 2-wide tail panel.
    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
    GLD f, d, $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18
    GST f, d, $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38
    PTR_ADDI  P4, P4, 0x40
    PTR_ADDI  S1, S1, 0x08
    PTR_ADDI  S2, S2, 0x08
    PTR_ADDI  S3, S3, 0x08
    PTR_ADDI  S4, S4, 0x08
    PTR_ADDI  S5, S5, 0x08
    PTR_ADDI  S6, S6, 0x08
    PTR_ADDI  S7, S7, 0x08
    PTR_ADDI  S8, S8, 0x08
.L_N1:
    andi      I, N, 0x01
    beq       I, ZERO, .L_N0

    // Last single float of every line into the 1-wide tail panel.
    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
    GLD f, s, $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C
    GST f, s, $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C
    PTR_ADDI  P5, P5, 0x20
    PTR_ADDI  S1, S1, 0x04
    PTR_ADDI  S2, S2, 0x04
    PTR_ADDI  S3, S3, 0x04
    PTR_ADDI  S4, S4, 0x04
    PTR_ADDI  S5, S5, 0x04
    PTR_ADDI  S6, S6, 0x04
    PTR_ADDI  S7, S7, 0x04
    PTR_ADDI  S8, S8, 0x04
.L_N0:
    blt       ZERO, J, .L_M8        // more eight-line groups left
|
||||
// Four-line group: runs once when bit 2 of m is set. Same scheme as the
// eight-line loop, with S1..S4 only and half-sized destination steps.
.L_M7:
    andi      J, M, 0x04
    beq       J, ZERO, .L_M3
.L_M4:
    move      S1, S0
    PTR_ADD   S2, S0, TL
    PTR_ADD   S3, S1, T0
    PTR_ADD   S4, S2, T0
    PTR_ADD   S0, S3, T0            // S0 skips the four lines handled here

    move      P1, P0
    PTR_ADDI  P0, P0, 0x100         // 4 * 64 bytes per 16-wide panel
    PTR_SRAI  I, N, 0x04
    beq       I, ZERO, .L_M4_N15
    .align 5
.L_M4_N16:
    GLD xv, , U0, S1, 0x00, U1, S1, 0x20, U2, S2, 0x00, U3, S2, 0x20
    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60
    GLD xv, , U4, S3, 0x00, U5, S3, 0x20, U6, S4, 0x00, U7, S4, 0x20
    GST xv, , U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0

    PTR_ADD   P1, P1, T1
    PTR_ADDI  I, I, -1
    PTR_ADDI  S1, S1, 0x40
    PTR_ADDI  S2, S2, 0x40
    PTR_ADDI  S3, S3, 0x40
    PTR_ADDI  S4, S4, 0x40
    blt       ZERO, I, .L_M4_N16
.L_M4_N15:
    andi      I, N, 0x08
    beq       I, ZERO, .L_M4_N7
.L_M4_N8:
    GLD xv, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    xvst      U0, P2, 0x00
    xvst      U1, P2, 0x20
    xvst      U2, P2, 0x40
    xvst      U3, P2, 0x60
    PTR_ADDI  P2, P2, 0x80
    PTR_ADDI  S1, S1, 0x20
    PTR_ADDI  S2, S2, 0x20
    PTR_ADDI  S3, S3, 0x20
    PTR_ADDI  S4, S4, 0x20
.L_M4_N7:
    andi      I, N, 0x04
    beq       I, ZERO, .L_M4_N3
.L_M4_N4:
    vld       $vr0, S1, 0x00
    vld       $vr1, S2, 0x00
    vld       $vr2, S3, 0x00
    vld       $vr3, S4, 0x00
    vst       $vr0, P3, 0x00
    vst       $vr1, P3, 0x10
    vst       $vr2, P3, 0x20
    vst       $vr3, P3, 0x30
    PTR_ADDI  P3, P3, 0x40
    PTR_ADDI  S1, S1, 0x10
    PTR_ADDI  S2, S2, 0x10
    PTR_ADDI  S3, S3, 0x10
    PTR_ADDI  S4, S4, 0x10
.L_M4_N3:
    andi      I, N, 0x02
    beq       I, ZERO, .L_M4_N1
.L_M4_N2:
    fld.d     $f0, S1, 0x00
    fld.d     $f1, S2, 0x00
    fld.d     $f2, S3, 0x00
    fld.d     $f3, S4, 0x00
    fst.d     $f0, P4, 0x00
    fst.d     $f1, P4, 0x08
    fst.d     $f2, P4, 0x10
    fst.d     $f3, P4, 0x18
    PTR_ADDI  P4, P4, 0x20
    PTR_ADDI  S1, S1, 0x08
    PTR_ADDI  S2, S2, 0x08
    PTR_ADDI  S3, S3, 0x08
    PTR_ADDI  S4, S4, 0x08
.L_M4_N1:
    andi      I, N, 0x01
    beq       I, ZERO, .L_M3

    fld.s     $f0, S1, 0x00
    fld.s     $f1, S2, 0x00
    fld.s     $f2, S3, 0x00
    fld.s     $f3, S4, 0x00
    fst.s     $f0, P5, 0x00
    fst.s     $f1, P5, 0x04
    fst.s     $f2, P5, 0x08
    fst.s     $f3, P5, 0x0C
    PTR_ADDI  P5, P5, 0x10
    PTR_ADDI  S1, S1, 0x04
    PTR_ADDI  S2, S2, 0x04
    PTR_ADDI  S3, S3, 0x04
    PTR_ADDI  S4, S4, 0x04
|
||||
// Two-line group: runs once when bit 1 of m is set. S1/S2 walk the two
// lines; destinations advance by a quarter of the eight-line steps.
.L_M3:
    andi      J, M, 0x02
    beq       J, ZERO, .L_M1
.L_M2:
    move      S1, S0
    PTR_ADD   S2, S0, TL
    PTR_ADD   S0, S0, T0            // S0 skips the two lines handled here

    move      P1, P0
    PTR_ADDI  P0, P0, 0x80          // 2 * 64 bytes per 16-wide panel
    PTR_SRAI  I, N, 0x04
    beq       I, ZERO, .L_M2_N15
    .align 5
.L_M2_N16:
    GLD xv, , U0, S1, 0x00, U1, S1, 0x20, U2, S2, 0x00, U3, S2, 0x20
    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60

    PTR_ADD   P1, P1, T1
    PTR_ADDI  I, I, -1
    PTR_ADDI  S1, S1, 0x40
    PTR_ADDI  S2, S2, 0x40
    blt       ZERO, I, .L_M2_N16
.L_M2_N15:
    andi      I, N, 0x08
    beq       I, ZERO, .L_M2_N7
.L_M2_N8:
    GLD xv, , U0, S1, 0x00, U1, S2, 0x00
    xvst      U0, P2, 0x00
    xvst      U1, P2, 0x20
    PTR_ADDI  P2, P2, 0x40
    PTR_ADDI  S1, S1, 0x20
    PTR_ADDI  S2, S2, 0x20
.L_M2_N7:
    andi      I, N, 0x04
    beq       I, ZERO, .L_M2_N3
.L_M2_N4:
    vld       $vr0, S1, 0x00
    vld       $vr1, S2, 0x00
    vst       $vr0, P3, 0x00
    vst       $vr1, P3, 0x10
    PTR_ADDI  P3, P3, 0x20
    PTR_ADDI  S1, S1, 0x10
    PTR_ADDI  S2, S2, 0x10
.L_M2_N3:
    andi      I, N, 0x02
    beq       I, ZERO, .L_M2_N1
.L_M2_N2:
    fld.d     $f0, S1, 0x00
    fld.d     $f1, S2, 0x00
    fst.d     $f0, P4, 0x00
    fst.d     $f1, P4, 0x08
    PTR_ADDI  P4, P4, 0x10
    PTR_ADDI  S1, S1, 0x08
    PTR_ADDI  S2, S2, 0x08
.L_M2_N1:
    andi      I, N, 0x01
    beq       I, ZERO, .L_M1

    fld.s     $f0, S1, 0x00
    fld.s     $f1, S2, 0x00
    fst.s     $f0, P5, 0x00
    fst.s     $f1, P5, 0x04
    PTR_ADDI  P5, P5, 0x08
    PTR_ADDI  S1, S1, 0x04
    PTR_ADDI  S2, S2, 0x04
|
||||
// Final single line: runs once when bit 0 of m is set, then falls into the
// common exit at .L_M0.
// Only one source line remains, so S1 is the only source cursor (no second
// line pointer is formed). Nothing reads P0, P2..P5 or S1 after their last
// use below, so those cursors are not advanced past that use.
.L_M1:
    andi      J, M, 0x01
    beq       ZERO, J, .L_M0

    move      S1, S0
    move      P1, P0

    PTR_SRAI  I, N, 0x04            // number of full 16-element chunks
    beq       ZERO, I, .L_M1_N15
    .align 5
.L_M1_N16:
    xvld      U0, S1, 0x00
    xvld      U1, S1, 0x20
    xvst      U0, P1, 0x00
    xvst      U1, P1, 0x20

    PTR_ADDI  S1, S1, 0x40
    PTR_ADDI  I, I, -1
    PTR_ADD   P1, P1, T1            // same slot in the next 16-wide panel
    blt       ZERO, I, .L_M1_N16
.L_M1_N15:
    andi      I, N, 0x08
    beq       ZERO, I, .L_M1_N7
.L_M1_N8:
    xvld      U0, S1, 0x00
    xvst      U0, P2, 0x00          // 8-wide tail panel
    PTR_ADDI  S1, S1, 0x20
.L_M1_N7:
    andi      I, N, 0x04
    beq       ZERO, I, .L_M1_N3
.L_M1_N4:
    vld       $vr0, S1, 0x00
    vst       $vr0, P3, 0x00        // 4-wide tail panel
    PTR_ADDI  S1, S1, 0x10
.L_M1_N3:
    andi      I, N, 0x02
    beq       ZERO, I, .L_M1_N1
.L_M1_N2:
    fld.d     $f0, S1, 0x00
    fst.d     $f0, P4, 0x00         // 2-wide tail panel
    PTR_ADDI  S1, S1, 0x08
.L_M1_N1:
    andi      I, N, 0x01
    beq       ZERO, I, .L_M0

    fld.s     $f0, S1, 0x00
    fst.s     $f0, P5, 0x00         // 1-wide tail panel
.L_M0:
    // Must mirror the push_if_used arguments used at entry.
    pop_if_used 24, 8
    jirl      $r0, $r1, 0x00        // return through $r1
    EPILOGUE
|
|
@ -0,0 +1,406 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/08/23 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*********************************************************************/
|
||||
|
||||
/* Function parameters */
#define M      $r4    // param 1: m
#define N      $r5    // param 2: n
#define SRC    $r6    // param 3: src
#define LDA    $r7    // param 4: lda
#define DST    $r8    // param 5: dst

/* Loop counters */
#define I      $r9    // inner counter, also scratch for the n tail-bit tests
#define J      $r10   // outer counter, also scratch for the m tail-bit tests

/* Source cursors */
#define S0     $r11   // first line of the next group of source lines
#define S1     $r12   // S1..S8: one cursor per source line of the current group
#define S2     $r13
#define S3     $r14
#define S4     $r15
#define S5     $r16
#define S6     $r17
#define S7     $r18
#define S8     $r19

/* Destination cursors */
#define P0     $r20   // next slot inside the 8-wide panels
#define P1     $r23   // working copy of P0 used by the 8-wide loops
#define P2     $r24   // 4-wide tail panel
#define P3     $r25   // 2-wide tail panel
#define P4     $r26   // 1-wide tail panel

/* Strides */
#define T0     $r27   // scratch during setup, afterwards 2 * lda in bytes
#define T1     $r28   // scratch during setup, afterwards m * 32 bytes
#define TL     $r7    // lda in bytes; shares $r7 with LDA, which is scaled in place

/* Rebind ZERO to the zero register, discarding any earlier definition. */
#undef  ZERO
#define ZERO   $r0

/* LASX vectors */
#define U0     $xr0
#define U1     $xr1
#define U2     $xr2
#define U3     $xr3
#define U4     $xr4
#define U5     $xr5
#define U6     $xr6
#define U7     $xr7
|
||||
|
||||
// Loops outline
|
||||
//.L_M8 <-------------------
|
||||
//| .L_N8: |
|
||||
//| .L_N7: | Main Loop
|
||||
//| .L_N4: |
|
||||
//| .L_N3: |
|
||||
//| .L_N2: |
|
||||
//| .L_N1: |
|
||||
//| .L_N0: ---------------
|
||||
//.L_M7
|
||||
//.L_M4
|
||||
//| .L_M4_N8:
|
||||
//| .L_M4_N7:
|
||||
//| .L_M4_N4:
|
||||
//| .L_M4_N3:
|
||||
//| .L_M4_N2:
|
||||
//| .L_M4_N1:
|
||||
//.L_M3
|
||||
//.L_M2
|
||||
//| .L_M2_N8:
|
||||
//| .L_M2_N7:
|
||||
//| .L_M2_N4:
|
||||
//| .L_M2_N3:
|
||||
//| .L_M2_N2:
|
||||
//| .L_M2_N1:
|
||||
//.L_M1
|
||||
//| .L_M1_N8:
|
||||
//| .L_M1_N7:
|
||||
//| .L_M1_N4:
|
||||
//| .L_M1_N3:
|
||||
//| .L_M1_N2:
|
||||
//| .L_M1_N1:
|
||||
//.L_M0
|
||||
|
||||
// Entry and setup for the 8-wide copy.
//   P0 = DST                          cursor for the 8-wide panels
//   P2 = DST + 4 * m * (n & ~7)       start of the 4-wide tail panel
//   P3 = DST + 4 * m * (n & ~3)       start of the 2-wide tail panel
//   P4 = DST + 4 * m * (n & ~1)       start of the 1-wide tail panel
// No (n & ~15) value is formed: this routine has no 16-wide panel, so that
// result would never be read.
// NOTE(review): push_if_used comes from the included asm header; it
// presumably spills the callee-saved registers used here - confirm there.
    PROLOGUE
    push_if_used 23, 8

    move      S0, SRC
    move      P0, DST

    // P2: n rounded down to a multiple of 8
    PTR_SRAI  T1, N, 0x03
    PTR_SLLI  T1, T1, 0x03
    PTR_MUL   P2, M, T1
    PTR_SLLI  P2, P2, 0x02
    PTR_ADD   P2, DST, P2

    // P3: n rounded down to a multiple of 4
    PTR_SRAI  T0, N, 0x02
    PTR_SLLI  T0, T0, 0x02
    PTR_MUL   P3, M, T0
    PTR_SLLI  P3, P3, 0x02
    PTR_ADD   P3, DST, P3

    // P4: n rounded down to a multiple of 2
    PTR_SRAI  T1, N, 0x01
    PTR_SLLI  T1, T1, 0x01
    PTR_MUL   P4, M, T1
    PTR_SLLI  P4, P4, 0x02
    PTR_ADD   P4, DST, P4

    // Strides used by every section below.
    PTR_SLLI  TL, LDA, 0x02         // lda * 4: byte distance between source lines
    PTR_SRAI  J, M, 0x03            // number of full eight-line groups
    PTR_SLLI  T0, TL, 0x01          // two source lines, in bytes
    PTR_SLLI  T1, M, 0x05           // m * 32: byte distance between 8-wide panels
    beq       ZERO, J, .L_M7
|
||||
// Main loop: each pass packs one group of eight lda-strided source lines.
// Full 8-element chunks go to the 8-wide panels through P1 (stepping by T1
// per chunk); the n & 4 / 2 / 1 leftovers go to the tail panels P2..P4.
    .align 5
.L_M8:
    // Odd lines chain from S1 and even lines from S2, each step 2 * lda bytes.
    move      S1, S0
    PTR_ADD   S2, S0, TL
    PTR_ADD   S3, S1, T0
    PTR_ADD   S5, S3, T0
    PTR_ADD   S7, S5, T0
    PTR_ADD   S4, S2, T0
    PTR_ADD   S6, S4, T0
    PTR_ADD   S8, S6, T0
    PTR_ADD   S0, S7, T0            // S0 now addresses the following group

    move      P1, P0
    PTR_ADDI  P0, P0, 0x100         // this group takes 8 * 32 bytes in each 8-wide panel
    PTR_ADDI  J, J, -1
    PTR_SRAI  I, N, 0x03            // number of full 8-element chunks
    beq       I, ZERO, .L_N7
.L_N8:
    GLD xv, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    GLD xv, , U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00
    xvst      U0, P1, 0x00
    xvst      U1, P1, 0x20
    xvst      U2, P1, 0x40
    xvst      U3, P1, 0x60
    xvst      U4, P1, 0x80
    xvst      U5, P1, 0xA0
    xvst      U6, P1, 0xC0
    xvst      U7, P1, 0xE0

    PTR_ADD   P1, P1, T1            // same group slot in the next 8-wide panel
    PTR_ADDI  I, I, -1
    PTR_ADDI  S1, S1, 0x20
    PTR_ADDI  S2, S2, 0x20
    PTR_ADDI  S3, S3, 0x20
    PTR_ADDI  S4, S4, 0x20
    PTR_ADDI  S5, S5, 0x20
    PTR_ADDI  S6, S6, 0x20
    PTR_ADDI  S7, S7, 0x20
    PTR_ADDI  S8, S8, 0x20
    blt       ZERO, I, .L_N8
.L_N7:
    andi      I, N, 0x04
    beq       I, ZERO, .L_N3
.L_N4:
    // 4 floats per line into the 4-wide tail panel.
    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
    GLD v, , $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30
    GST v, , $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70
    PTR_ADDI  P2, P2, 0x80
    PTR_ADDI  S1, S1, 0x10
    PTR_ADDI  S2, S2, 0x10
    PTR_ADDI  S3, S3, 0x10
    PTR_ADDI  S4, S4, 0x10
    PTR_ADDI  S5, S5, 0x10
    PTR_ADDI  S6, S6, 0x10
    PTR_ADDI  S7, S7, 0x10
    PTR_ADDI  S8, S8, 0x10
.L_N3:
    andi      I, N, 0x02
    beq       I, ZERO, .L_N1
.L_N2:
    // 2 floats per line, moved as one 8-byte quantity, into the 2-wide tail panel.
    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
    GLD f, d, $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18
    GST f, d, $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38
    PTR_ADDI  P3, P3, 0x40
    PTR_ADDI  S1, S1, 0x08
    PTR_ADDI  S2, S2, 0x08
    PTR_ADDI  S3, S3, 0x08
    PTR_ADDI  S4, S4, 0x08
    PTR_ADDI  S5, S5, 0x08
    PTR_ADDI  S6, S6, 0x08
    PTR_ADDI  S7, S7, 0x08
    PTR_ADDI  S8, S8, 0x08
.L_N1:
    andi      I, N, 0x01
    beq       I, ZERO, .L_N0

    // Last single float of every line into the 1-wide tail panel.
    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
    GLD f, s, $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C
    GST f, s, $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C
    PTR_ADDI  P4, P4, 0x20
    PTR_ADDI  S1, S1, 0x04
    PTR_ADDI  S2, S2, 0x04
    PTR_ADDI  S3, S3, 0x04
    PTR_ADDI  S4, S4, 0x04
    PTR_ADDI  S5, S5, 0x04
    PTR_ADDI  S6, S6, 0x04
    PTR_ADDI  S7, S7, 0x04
    PTR_ADDI  S8, S8, 0x04
.L_N0:
    blt       ZERO, J, .L_M8        // more eight-line groups left
|
||||
|
||||
// Four-line group: runs once when bit 2 of m is set. Same scheme as the
// eight-line loop, with S1..S4 only and half-sized destination steps.
.L_M7:
    andi      J, M, 0x04
    beq       J, ZERO, .L_M3
.L_M4:
    move      S1, S0
    PTR_ADD   S2, S0, TL
    PTR_ADD   S3, S1, T0
    PTR_ADD   S4, S2, T0
    PTR_ADD   S0, S3, T0            // S0 skips the four lines handled here

    move      P1, P0
    PTR_ADDI  P0, P0, 0x80          // 4 * 32 bytes per 8-wide panel
    PTR_SRAI  I, N, 0x03
    beq       I, ZERO, .L_M4_N7
    .align 5
.L_M4_N8:
    GLD xv, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    xvst      U0, P1, 0x00
    xvst      U1, P1, 0x20
    xvst      U2, P1, 0x40
    xvst      U3, P1, 0x60

    PTR_ADD   P1, P1, T1
    PTR_ADDI  I, I, -1
    PTR_ADDI  S1, S1, 0x20
    PTR_ADDI  S2, S2, 0x20
    PTR_ADDI  S3, S3, 0x20
    PTR_ADDI  S4, S4, 0x20
    blt       ZERO, I, .L_M4_N8
.L_M4_N7:
    andi      I, N, 0x04
    beq       I, ZERO, .L_M4_N3
.L_M4_N4:
    vld       $vr0, S1, 0x00
    vld       $vr1, S2, 0x00
    vld       $vr2, S3, 0x00
    vld       $vr3, S4, 0x00
    vst       $vr0, P2, 0x00
    vst       $vr1, P2, 0x10
    vst       $vr2, P2, 0x20
    vst       $vr3, P2, 0x30
    PTR_ADDI  P2, P2, 0x40
    PTR_ADDI  S1, S1, 0x10
    PTR_ADDI  S2, S2, 0x10
    PTR_ADDI  S3, S3, 0x10
    PTR_ADDI  S4, S4, 0x10
.L_M4_N3:
    andi      I, N, 0x02
    beq       I, ZERO, .L_M4_N1
.L_M4_N2:
    fld.d     $f0, S1, 0x00
    fld.d     $f1, S2, 0x00
    fld.d     $f2, S3, 0x00
    fld.d     $f3, S4, 0x00
    fst.d     $f0, P3, 0x00
    fst.d     $f1, P3, 0x08
    fst.d     $f2, P3, 0x10
    fst.d     $f3, P3, 0x18
    PTR_ADDI  P3, P3, 0x20
    PTR_ADDI  S1, S1, 0x08
    PTR_ADDI  S2, S2, 0x08
    PTR_ADDI  S3, S3, 0x08
    PTR_ADDI  S4, S4, 0x08
.L_M4_N1:
    andi      I, N, 0x01
    beq       I, ZERO, .L_M3

    fld.s     $f0, S1, 0x00
    fld.s     $f1, S2, 0x00
    fld.s     $f2, S3, 0x00
    fld.s     $f3, S4, 0x00
    fst.s     $f0, P4, 0x00
    fst.s     $f1, P4, 0x04
    fst.s     $f2, P4, 0x08
    fst.s     $f3, P4, 0x0C
    PTR_ADDI  P4, P4, 0x10
    PTR_ADDI  S1, S1, 0x04
    PTR_ADDI  S2, S2, 0x04
    PTR_ADDI  S3, S3, 0x04
    PTR_ADDI  S4, S4, 0x04
|
||||
// Two-line group: runs once when bit 1 of m is set. S1/S2 walk the two
// lines; destinations advance by a quarter of the eight-line steps.
.L_M3:
    andi      J, M, 0x02
    beq       J, ZERO, .L_M1
.L_M2:
    move      S1, S0
    PTR_ADD   S2, S0, TL
    PTR_ADD   S0, S0, T0            // S0 skips the two lines handled here

    move      P1, P0
    PTR_ADDI  P0, P0, 0x40          // 2 * 32 bytes per 8-wide panel
    PTR_SRAI  I, N, 0x03
    beq       I, ZERO, .L_M2_N7
    .align 5
.L_M2_N8:
    GLD xv, , U0, S1, 0x00, U1, S2, 0x00
    xvst      U0, P1, 0x00
    xvst      U1, P1, 0x20

    PTR_ADD   P1, P1, T1
    PTR_ADDI  I, I, -1
    PTR_ADDI  S1, S1, 0x20
    PTR_ADDI  S2, S2, 0x20
    blt       ZERO, I, .L_M2_N8
.L_M2_N7:
    andi      I, N, 0x04
    beq       I, ZERO, .L_M2_N3
.L_M2_N4:
    vld       $vr0, S1, 0x00
    vld       $vr1, S2, 0x00
    vst       $vr0, P2, 0x00
    vst       $vr1, P2, 0x10
    PTR_ADDI  P2, P2, 0x20
    PTR_ADDI  S1, S1, 0x10
    PTR_ADDI  S2, S2, 0x10
.L_M2_N3:
    andi      I, N, 0x02
    beq       I, ZERO, .L_M2_N1
.L_M2_N2:
    fld.d     $f0, S1, 0x00
    fld.d     $f1, S2, 0x00
    fst.d     $f0, P3, 0x00
    fst.d     $f1, P3, 0x08
    PTR_ADDI  P3, P3, 0x10
    PTR_ADDI  S1, S1, 0x08
    PTR_ADDI  S2, S2, 0x08
.L_M2_N1:
    andi      I, N, 0x01
    beq       I, ZERO, .L_M1

    fld.s     $f0, S1, 0x00
    fld.s     $f1, S2, 0x00
    fst.s     $f0, P4, 0x00
    fst.s     $f1, P4, 0x04
    PTR_ADDI  P4, P4, 0x08
    PTR_ADDI  S1, S1, 0x04
    PTR_ADDI  S2, S2, 0x04
|
||||
// Final single line: runs once when bit 0 of m is set, then falls into the
// common exit at .L_M0.
// Only one source line remains, so S1 is the only source cursor (no second
// line pointer is formed). Nothing reads P0, P2..P4 or S1 after their last
// use below, so those cursors are not advanced past that use.
.L_M1:
    andi      J, M, 0x01
    beq       ZERO, J, .L_M0

    move      S1, S0
    move      P1, P0

    PTR_SRAI  I, N, 0x03            // number of full 8-element chunks
    beq       ZERO, I, .L_M1_N7
    .align 5
.L_M1_N8:
    xvld      U0, S1, 0x00
    xvst      U0, P1, 0x00

    PTR_ADDI  S1, S1, 0x20
    PTR_ADDI  I, I, -1
    PTR_ADD   P1, P1, T1            // same slot in the next 8-wide panel
    blt       ZERO, I, .L_M1_N8
.L_M1_N7:
    andi      I, N, 0x04
    beq       ZERO, I, .L_M1_N3
.L_M1_N4:
    vld       $vr0, S1, 0x00
    vst       $vr0, P2, 0x00        // 4-wide tail panel
    PTR_ADDI  S1, S1, 0x10
.L_M1_N3:
    andi      I, N, 0x02
    beq       ZERO, I, .L_M1_N1
.L_M1_N2:
    fld.d     $f0, S1, 0x00
    fst.d     $f0, P3, 0x00         // 2-wide tail panel
    PTR_ADDI  S1, S1, 0x08
.L_M1_N1:
    andi      I, N, 0x01
    beq       ZERO, I, .L_M0

    fld.s     $f0, S1, 0x00
    fst.s     $f0, P4, 0x00         // 1-wide tail panel
.L_M0:
    // Must mirror the push_if_used arguments used at entry.
    pop_if_used 23, 8
    jirl      $r0, $r1, 0x00        // return through $r1
    EPILOGUE
|
12
param.h
12
param.h
|
@ -2848,34 +2848,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* DGEMM/SGEMM unroll factors (UNROLL_M, UNROLL_N), selected by whether
 * LASX is available.
 * NOTE(review): the LASX SGEMM values (M = 16, N = 8) presumably have to
 * match the 16x8 SGEMM kernel and the 16-/8-wide copy routines chosen in
 * the kernel Makefile - confirm when changing either side. */
#if defined(NO_LASX)
|
||||
/* Without LASX: UNROLL_M = 2, UNROLL_N = 8 for both precisions. */
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#else
|
||||
/* With LASX: UNROLL_M = 16 for both; UNROLL_N = 4 (DGEMM) / 8 (SGEMM). */
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#endif
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
#define QGEMM_DEFAULT_UNROLL_N 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define XGEMM_DEFAULT_UNROLL_N 1
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_M 1
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 1
|
||||
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||
|
||||
#define SGEMM_DEFAULT_P 512
|
||||
#define SGEMM_DEFAULT_P 256
|
||||
#define DGEMM_DEFAULT_P 32
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_R 12288
|
||||
#define SGEMM_DEFAULT_R 1024
|
||||
#define DGEMM_DEFAULT_R 858
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#define SGEMM_DEFAULT_Q 128
|
||||
#define SGEMM_DEFAULT_Q 256
|
||||
#define DGEMM_DEFAULT_Q 152
|
||||
#define CGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q 128
|
||||
|
|
Loading…
Reference in New Issue