fix sve dgemm kernel + sve dtrmm

This commit is contained in:
Bine Brank 2021-10-31 10:24:25 +01:00
parent 746b4f0f17
commit a8fbdbac34
2 changed files with 1088 additions and 59 deletions

View File

@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12 #define pCRow0 x12
#define pCRow1 x13 #define pCRow1 x13
#define pCRow2 x14 #define pCRow2 x14
#define pCRow3 x15
#define lanes x15
#define pA x16 #define pA x16
#define alpha x17 #define alpha x17
#define alpha0 d10 #define alpha0 d10
#define alphaZ z10.d #define alphaZ z2.d
#define alphaV0 v10.d[0]
#define A_PRE_SIZE 2560 #define A_PRE_SIZE 2560
#define B_PRE_SIZE 448 #define B_PRE_SIZE 512
#define C_PRE_SIZE 128 #define C_PRE_SIZE 128
// 00 origM // 00 origM
@ -73,9 +73,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0 // 12 pCRow0
// 13 pCRow1 // 13 pCRow1
// 14 pCRow2 // 14 pCRow2
// 15 pCRow3 // 15 lanes
// 16 pA // 16 pA
// 17 // 17
// 18 must save // 18 must save
// 19 must save // 19 must save
// 20 must save // 20 must save
@ -93,20 +93,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v00 ALPHA -> pA0_0 //v00 ALPHA -> pA0_0
//v01 pA0_1 //v01 pA0_1
//v02 pA0_2 //v02 ALPHA0
//v03 pA0_3 //v03
//v04 pA0_4 //v04
//v05 pA0_5 //v05
//v06 pA0_6 //v06
//v07 pA0_7 //v07
//v08 must save pB0_0 //v08 must save pB0_0
//v09 must save pB0_1 //v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0 //v10 must save pB0_2
//v11 must save pB0_3 //v11 must save pB0_3
//v12 must save pB1_0 //v12 must save pB0_4
//v13 must save pB1_1 //v13 must save pB0_5
//v14 must save pB1_2 //v14 must save pB0_6
//v15 must save pB1_3 //v15 must save pB0_7
//v16 must save C0 //v16 must save C0
//v17 must save C1 //v17 must save C1
//v18 must save C2 //v18 must save C2
@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_I .macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA] ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, x18, lsl #3] // next one ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
//incb pA, all, mul #2 //incb pA, all, mul #2
add pA, pA, x18, lsl #4 // pA = pA + cnt_active * 2 * 8 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
ld1rd z8.d, p0/z, [pB] ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8] ld1rd z9.d, p0/z, [pB, 8]
@ -157,12 +157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z19.d, p1/m, z0.d, z11.d fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24] ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32] ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40] ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48] ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56] ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64 add pB, pB, 64
@ -170,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_M1 .macro KERNELv1x8_M1
ld1d z1.d, p1/z, [pA] ld1d z1.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
fmla z16.d, p1/m, z0.d, z8.d fmla z16.d, p1/m, z0.d, z8.d
ld1rd z8.d, p0/z, [pB] ld1rd z8.d, p0/z, [pB]
@ -181,12 +183,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z19.d, p1/m, z0.d, z11.d fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24] ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32] ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40] ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48] ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56] ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64 add pB, pB, 64
@ -194,7 +198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_M2 .macro KERNELv1x8_M2
ld1d z0.d, p1/z, [pA] ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
fmla z16.d, p1/m, z1.d, z8.d fmla z16.d, p1/m, z1.d, z8.d
ld1rd z8.d, p0/z, [pB] ld1rd z8.d, p0/z, [pB]
@ -206,6 +210,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rd z11.d, p0/z, [pB, 24] ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z1.d, z12.d fmla z20.d, p1/m, z1.d, z12.d
ld1rd z12.d, p0/z, [pB, 32] ld1rd z12.d, p0/z, [pB, 32]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d fmla z21.d, p1/m, z1.d, z13.d
ld1rd z13.d, p0/z, [pB, 40] ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z1.d, z14.d fmla z22.d, p1/m, z1.d, z14.d
@ -222,6 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z18.d, p1/m, z1.d, z10.d fmla z18.d, p1/m, z1.d, z10.d
fmla z19.d, p1/m, z1.d, z11.d fmla z19.d, p1/m, z1.d, z11.d
fmla z20.d, p1/m, z1.d, z12.d fmla z20.d, p1/m, z1.d, z12.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d fmla z21.d, p1/m, z1.d, z13.d
fmla z22.d, p1/m, z1.d, z14.d fmla z22.d, p1/m, z1.d, z14.d
fmla z23.d, p1/m, z1.d, z15.d fmla z23.d, p1/m, z1.d, z15.d
@ -229,7 +235,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_SUB .macro KERNELv1x8_SUB
ld1d z0.d, p1/z, [pA] ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB] ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8] ld1rd z9.d, p0/z, [pB, 8]
@ -245,16 +251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z16.d, p1/m, z0.d, z8.d fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d fmla z17.d, p1/m, z0.d, z9.d
fmla z18.d, p1/m, z0.d, z10.d fmla z18.d, p1/m, z0.d, z10.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z19.d, p1/m, z0.d, z11.d fmla z19.d, p1/m, z0.d, z11.d
fmla z20.d, p1/m, z0.d, z12.d fmla z20.d, p1/m, z0.d, z12.d
fmla z21.d, p1/m, z0.d, z13.d fmla z21.d, p1/m, z0.d, z13.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z22.d, p1/m, z0.d, z14.d fmla z22.d, p1/m, z0.d, z14.d
fmla z23.d, p1/m, z0.d, z15.d fmla z23.d, p1/m, z0.d, z15.d
.endm .endm
.macro SAVEv1x8 .macro SAVEv1x8
dup alphaZ, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
@ -262,43 +269,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1d z24.d, p1/z, [pCRow0] ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0] st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1] ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1] st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2] ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2] st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
ld1d z27.d, p1/z, [pCRow1] ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1] st1d z27.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC add pCRow1, pCRow2, LDC
ld1d z28.d, p1/z, [pCRow2] ld1d z28.d, p1/z, [pCRow2]
fmla z28.d, p1/m, z20.d, alphaZ fmla z28.d, p1/m, z20.d, alphaZ
st1d z28.d, p1, [pCRow2] st1d z28.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
ld1d z29.d, p1/z, [pCRow1] ld1d z29.d, p1/z, [pCRow1]
fmla z29.d, p1/m, z21.d, alphaZ fmla z29.d, p1/m, z21.d, alphaZ
st1d z29.d, p1, [pCRow1] st1d z29.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC add pCRow1, pCRow2, LDC
ld1d z30.d, p1/z, [pCRow2] ld1d z30.d, p1/z, [pCRow2]
fmla z30.d, p1/m, z22.d, alphaZ fmla z30.d, p1/m, z22.d, alphaZ
st1d z30.d, p1, [pCRow2] st1d z30.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z31.d, p1/z, [pCRow1] ld1d z31.d, p1/z, [pCRow1]
fmla z31.d, p1/m, z23.d, alphaZ fmla z31.d, p1/m, z23.d, alphaZ
st1d z31.d, p1, [pCRow1] st1d z31.d, p1, [pCRow1]
add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm .endm
@ -313,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x4_SUB .macro KERNELv1x4_SUB
ld1d z0.d, p1/z, [pA] ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB] ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8] ld1rd z9.d, p0/z, [pB, 8]
@ -324,13 +337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z16.d, p1/m, z0.d, z8.d fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d fmla z17.d, p1/m, z0.d, z9.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z18.d, p1/m, z0.d, z10.d fmla z18.d, p1/m, z0.d, z10.d
fmla z19.d, p1/m, z0.d, z11.d fmla z19.d, p1/m, z0.d, z11.d
.endm .endm
.macro SAVEv1x4 .macro SAVEv1x4
dup alphaZ, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
@ -338,23 +351,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1d z24.d, p1/z, [pCRow0] ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0] st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1] ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1] st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2] ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2] st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z27.d, p1/z, [pCRow1] ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1] st1d z27.d, p1, [pCRow1]
add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm .endm
@ -367,7 +382,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x2_SUB .macro KERNELv1x2_SUB
ld1d z0.d, p1/z, [pA] ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB] ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8] ld1rd z9.d, p0/z, [pB, 8]
@ -375,12 +390,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pB, pB, 16 add pB, pB, 16
fmla z16.d, p1/m, z0.d, z8.d fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z17.d, p1/m, z0.d, z9.d fmla z17.d, p1/m, z0.d, z9.d
.endm .endm
.macro SAVEv1x2 .macro SAVEv1x2
dup alphaZ, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
@ -388,13 +403,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1d z24.d, p1/z, [pCRow0] ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0] st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1] ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1] st1d z25.d, p1, [pCRow1]
add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm .endm
@ -406,28 +421,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x1_SUB .macro KERNELv1x1_SUB
ld1d z0.d, p1/z, [pA] ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB] ld1rd z8.d, p0/z, [pB]
add pB, pB, 8 add pB, pB, 8
fmla z16.d, p1/m, z0.d, z8.d fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm .endm
.macro SAVEv1x1 .macro SAVEv1x1
dup alphaZ, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0] ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0] st1d z24.d, p1, [pCRow0]
add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm .endm
@ -456,6 +470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
prfm PLDL1KEEP, [origPA] prfm PLDL1KEEP, [origPA]
fmov alpha, d0 fmov alpha, d0
dup alphaZ, alpha
lsl LDC, LDC, #3 // ldc = ldc * 8 lsl LDC, LDC, #3 // ldc = ldc * 8
ptrue p0.d // create true predicate ptrue p0.d // create true predicate
@ -473,7 +488,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.Ldgemm_kernel_L8_BEGIN: .Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pC, pCRow0, LDC, lsl #3 // add 8 x LDC add pC, pC, LDC, lsl #3 // add 8 x LDC
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
@ -481,11 +496,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterI, #0 mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
/* mov counterI, origM */
/* asr counterI, counterI, #3 // counterI = counterI / 8 */
/* cmp counterI, #0 */
/* ble .Ldgemm_kernel_L4_M4_BEGIN */
.align 5 .align 5
.Ldgemm_kernel_L8_Mv1_20: .Ldgemm_kernel_L8_Mv1_20:
@ -584,7 +595,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
incd counterI incd counterI
whilelt p1.d, counterI, origM //SVE instruction whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
b.any .Ldgemm_kernel_L8_Mv1_20 b.any .Ldgemm_kernel_L8_Mv1_20
.Ldgemm_kernel_L8_END: .Ldgemm_kernel_L8_END:
@ -608,7 +619,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov pCRow0, pC mov pCRow0, pC
add pC, pCRow0, LDC, lsl #2 // add 4 x LDC add pC, pC, LDC, lsl #2 // add 4 x LDC
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
@ -616,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterI, #0 mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d cntp lanes, p0, p1.d
.align 5 .align 5
.Ldgemm_kernel_L4_Mv1_20: .Ldgemm_kernel_L4_Mv1_20:
@ -626,17 +637,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
asr counterL , origK, #3 // L = K / 8 asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 4 to do? cmp counterL , #0 // is there at least 4 to do?
blt .Ldgemm_kernel_L4_Mv1_44 ble .Ldgemm_kernel_L4_Mv1_44
.align 5 .align 5
.Ldgemm_kernel_L4_Mv1_22: .Ldgemm_kernel_L4_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB KERNELv1x4_SUB
KERNELv1x4_SUB KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB KERNELv1x4_SUB
KERNELv1x4_SUB KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB KERNELv1x4_SUB
KERNELv1x4_SUB KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB KERNELv1x4_SUB
KERNELv1x4_SUB KERNELv1x4_SUB
@ -651,6 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5 .align 5
.Ldgemm_kernel_L4_Mv1_46: .Ldgemm_kernel_L4_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB KERNELv1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
@ -667,12 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
incd counterI incd counterI
whilelt p1.d, counterI, origM //SVE instruction whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L4_Mv1_20 b.any .Ldgemm_kernel_L4_Mv1_20
.Ldgemm_kernel_L4_END: .Ldgemm_kernel_L4_END:
add origPB, origPB, origK, lsl #5 // B = B + K * 4 * 8 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/ /******************************************************************************/
/******************************************************************************/ /******************************************************************************/
@ -686,7 +703,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov pCRow0, pC mov pCRow0, pC
add pC, pCRow0, LDC, lsl #1 // add 2 x LDC add pC, pC, LDC, lsl #1 // add 2 x LDC
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
@ -694,7 +711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterI, #0 mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d cntp lanes, p0, p1.d
.align 5 .align 5
.Ldgemm_kernel_L2_Mv1_20: .Ldgemm_kernel_L2_Mv1_20:
@ -704,15 +721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
asr counterL , origK, #3 // L = K / 8 asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 4 to do? cmp counterL , #0 // is there at least 4 to do?
blt .Ldgemm_kernel_L2_Mv1_44 ble .Ldgemm_kernel_L2_Mv1_44
.align 5 .align 5
.Ldgemm_kernel_L2_Mv1_22: .Ldgemm_kernel_L2_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB KERNELv1x2_SUB
KERNELv1x2_SUB KERNELv1x2_SUB
KERNELv1x2_SUB KERNELv1x2_SUB
KERNELv1x2_SUB KERNELv1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB KERNELv1x2_SUB
KERNELv1x2_SUB KERNELv1x2_SUB
KERNELv1x2_SUB KERNELv1x2_SUB
@ -729,6 +748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5 .align 5
.Ldgemm_kernel_L2_Mv1_46: .Ldgemm_kernel_L2_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB KERNELv1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
@ -745,7 +765,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
incd counterI incd counterI
whilelt p1.d, counterI, origM //SVE instruction whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L2_Mv1_20 b.any .Ldgemm_kernel_L2_Mv1_20
@ -764,7 +784,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov pCRow0, pC mov pCRow0, pC
add pC, pCRow0, LDC, lsl #1 // add 2 x LDC add pC, pC, LDC // add 1 x LDC
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
@ -772,7 +792,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterI, #0 mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d cntp lanes, p0, p1.d
.align 5 .align 5
.Ldgemm_kernel_L1_Mv1_20: .Ldgemm_kernel_L1_Mv1_20:
@ -781,12 +801,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INITv1x1 // fill with zeros INITv1x1 // fill with zeros
asr counterL , origK, #3 // L = K / 8 asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 4 to do? cmp counterL , #0 // is there at least 8 to do?
blt .Ldgemm_kernel_L1_Mv1_44 ble .Ldgemm_kernel_L1_Mv1_44
.align 5 .align 5
.Ldgemm_kernel_L1_Mv1_22: .Ldgemm_kernel_L1_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB KERNELv1x1_SUB
KERNELv1x1_SUB KERNELv1x1_SUB
KERNELv1x1_SUB KERNELv1x1_SUB
@ -807,10 +828,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5 .align 5
.Ldgemm_kernel_L1_Mv1_46: .Ldgemm_kernel_L1_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB KERNELv1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne .Ldgemm_kernel_L1_Mv1_46 bgt .Ldgemm_kernel_L1_Mv1_46
.Ldgemm_kernel_L1_Mv1_100: .Ldgemm_kernel_L1_Mv1_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
@ -823,7 +845,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
incd counterI incd counterI
whilelt p1.d, counterI, origM //SVE instruction whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L1_Mv1_20 b.any .Ldgemm_kernel_L1_Mv1_20

File diff suppressed because it is too large Load Diff