fix sve dgemm kernel + sve dtrmm
This commit is contained in:
parent
746b4f0f17
commit
a8fbdbac34
|
@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pCRow3 x15
|
||||
|
||||
#define lanes x15
|
||||
#define pA x16
|
||||
#define alpha x17
|
||||
|
||||
#define alpha0 d10
|
||||
#define alphaZ z10.d
|
||||
#define alphaV0 v10.d[0]
|
||||
#define alphaZ z2.d
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define B_PRE_SIZE 512
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
|
@ -73,9 +73,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pCRow3
|
||||
// 15 lanes
|
||||
// 16 pA
|
||||
// 17
|
||||
// 17 alpha
|
||||
// 18 must save
|
||||
// 19 must save
|
||||
// 20 must save
|
||||
|
@ -93,20 +93,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
//v00 ALPHA -> pA0_0
|
||||
//v01 pA0_1
|
||||
//v02 pA0_2
|
||||
//v03 pA0_3
|
||||
//v04 pA0_4
|
||||
//v05 pA0_5
|
||||
//v06 pA0_6
|
||||
//v07 pA0_7
|
||||
//v02 ALPHA0
|
||||
//v03
|
||||
//v04
|
||||
//v05
|
||||
//v06
|
||||
//v07
|
||||
//v08 must save pB0_0
|
||||
//v09 must save pB0_1
|
||||
//v10 must save pB0_2 --> ALPHA0
|
||||
//v10 must save pB0_2
|
||||
//v11 must save pB0_3
|
||||
//v12 must save pB1_0
|
||||
//v13 must save pB1_1
|
||||
//v14 must save pB1_2
|
||||
//v15 must save pB1_3
|
||||
//v12 must save pB0_4
|
||||
//v13 must save pB0_5
|
||||
//v14 must save pB0_6
|
||||
//v15 must save pB0_7
|
||||
//v16 must save C0
|
||||
//v17 must save C1
|
||||
//v18 must save C2
|
||||
|
@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNELv1x8_I
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
ld1d z1.d, p1/z, [pA, x18, lsl #3] // next one
|
||||
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
|
||||
//incb pA, all, mul #2
|
||||
add pA, pA, x18, lsl #4 // pA = pA + cnt_active * 2 * 8
|
||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
@ -157,12 +157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
@ -170,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNELv1x8_M1
|
||||
ld1d z1.d, p1/z, [pA]
|
||||
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
|
@ -181,12 +183,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
@ -194,7 +198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNELv1x8_M2
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
fmla z16.d, p1/m, z1.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
|
@ -206,6 +210,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z1.d, z12.d
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z21.d, p1/m, z1.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z1.d, z14.d
|
||||
|
@ -222,6 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla z18.d, p1/m, z1.d, z10.d
|
||||
fmla z19.d, p1/m, z1.d, z11.d
|
||||
fmla z20.d, p1/m, z1.d, z12.d
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z21.d, p1/m, z1.d, z13.d
|
||||
fmla z22.d, p1/m, z1.d, z14.d
|
||||
fmla z23.d, p1/m, z1.d, z15.d
|
||||
|
@ -229,7 +235,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNELv1x8_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
@ -245,16 +251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x8
|
||||
dup alphaZ, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
|
@ -262,43 +269,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z26.d, p1/z, [pCRow2]
|
||||
fmla z26.d, p1/m, z18.d, alphaZ
|
||||
st1d z26.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z27.d, p1/z, [pCRow1]
|
||||
fmla z27.d, p1/m, z19.d, alphaZ
|
||||
st1d z27.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z28.d, p1/z, [pCRow2]
|
||||
fmla z28.d, p1/m, z20.d, alphaZ
|
||||
st1d z28.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z29.d, p1/z, [pCRow1]
|
||||
fmla z29.d, p1/m, z21.d, alphaZ
|
||||
st1d z29.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z30.d, p1/z, [pCRow2]
|
||||
fmla z30.d, p1/m, z22.d, alphaZ
|
||||
st1d z30.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z31.d, p1/z, [pCRow1]
|
||||
fmla z31.d, p1/m, z23.d, alphaZ
|
||||
st1d z31.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
|
@ -313,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNELv1x4_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
@ -324,13 +337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
dup alphaZ, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
|
@ -338,23 +351,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z26.d, p1/z, [pCRow2]
|
||||
fmla z26.d, p1/m, z18.d, alphaZ
|
||||
st1d z26.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z27.d, p1/z, [pCRow1]
|
||||
fmla z27.d, p1/m, z19.d, alphaZ
|
||||
st1d z27.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
|
@ -367,7 +382,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNELv1x2_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
@ -375,12 +390,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add pB, pB, 16
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
dup alphaZ, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
|
@ -388,13 +403,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
|
@ -406,28 +421,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNELv1x1_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
|
||||
add pB, pB, 8
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
dup alphaZ, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
|
||||
|
||||
add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
|
@ -456,6 +470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alpha, d0
|
||||
dup alphaZ, alpha
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 8
|
||||
ptrue p0.d // create true predicate
|
||||
|
@ -473,7 +488,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.Ldgemm_kernel_L8_BEGIN:
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pCRow0, LDC, lsl #3 // add 8 x LDC
|
||||
add pC, pC, LDC, lsl #3 // add 8 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
|
@ -481,11 +496,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp x18, p0, p1.d
|
||||
/* mov counterI, origM */
|
||||
/* asr counterI, counterI, #3 // counterI = counterI / 8 */
|
||||
/* cmp counterI, #0 */
|
||||
/* ble .Ldgemm_kernel_L4_M4_BEGIN */
|
||||
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_20:
|
||||
|
@ -584,7 +595,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp x18, p0, p1.d
|
||||
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
||||
b.any .Ldgemm_kernel_L8_Mv1_20
|
||||
|
||||
.Ldgemm_kernel_L8_END:
|
||||
|
@ -608,7 +619,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pCRow0, LDC, lsl #2 // add 4 x LDC
|
||||
add pC, pC, LDC, lsl #2 // add 4 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
|
@ -616,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp x18, p0, p1.d
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_20:
|
||||
|
@ -626,17 +637,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 8 to do?
|
||||
blt .Ldgemm_kernel_L4_Mv1_44
|
||||
ble .Ldgemm_kernel_L4_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
|
||||
|
@ -651,6 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
|
@ -667,12 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp x18, p0, p1.d
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L4_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L4_END:
|
||||
add origPB, origPB, origK, lsl #5 // B = B + K * 4 * 8
|
||||
lsl temp, origK, #5
|
||||
add origPB, origPB, temp // B = B + K * 4 * 8
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
@ -686,7 +703,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pCRow0, LDC, lsl #1 // add 2 x LDC
|
||||
add pC, pC, LDC, lsl #1 // add 2 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
|
@ -694,7 +711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp x18, p0, p1.d
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_20:
|
||||
|
@ -704,15 +721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 8 to do?
|
||||
blt .Ldgemm_kernel_L2_Mv1_44
|
||||
ble .Ldgemm_kernel_L2_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
|
@ -729,6 +748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
|
@ -745,7 +765,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp x18, p0, p1.d
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L2_Mv1_20
|
||||
|
||||
|
||||
|
@ -764,7 +784,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pCRow0, LDC, lsl #1 // add 2 x LDC
|
||||
add pC, pC, LDC // add 1 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
|
@ -772,7 +792,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp x18, p0, p1.d
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_20:
|
||||
|
@ -781,12 +801,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
INITv1x1 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 4 to do?
|
||||
blt .Ldgemm_kernel_L1_Mv1_44
|
||||
cmp counterL , #0 // is there at least 8 to do?
|
||||
ble .Ldgemm_kernel_L1_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
|
@ -807,10 +828,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L1_Mv1_46
|
||||
bgt .Ldgemm_kernel_L1_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
|
@ -823,7 +845,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp x18, p0, p1.d
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L1_Mv1_20
|
||||
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue