Cortex-A57: Improve DGEMM 8x4 Implementation

Author: Ashwin Sekhar T K
Date:   2016-03-17 10:23:51 +05:30
parent 8519e4ed9f
commit 3b5ffb49d3

1 changed file with 206 additions and 174 deletions


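In outline, the commit replaces `ld1` loads followed by explicit pointer `add`s with post-indexed `ldp`, interleaves the B loads and `prfm` prefetches between groups of FMAs, introduces named prefetch distances (`A_PRE_SIZE`, `B_PRE_SIZE`, `C_PRE_SIZE`), collapses the four alpha copies into a single `alpha0`/`alphaV0` so that v11, v14 and v15 can carry B values instead, and puts `.align 5` on the hot loop entry points. For orientation, here is a scalar model of what one 8x4 micro-kernel call computes; this is hypothetical C for illustration (names and packing layout are assumptions), not part of the commit:

```c
#include <stddef.h>

/* Hypothetical scalar sketch of the 8x4 micro-kernel:
 * C[0..7][0..3] += alpha * A_panel * B_panel, where A is packed
 * 8 doubles per k-step (v0-v3 / v4-v7 in the assembly) and B is
 * packed 4 doubles per k-step (v8-v11 / v12-v15). The 8x4
 * accumulator tile lives in v16-v31, one column of 8 doubles per
 * group of four vector registers. */
static void dgemm_kernel_8x4_ref(size_t K, double alpha,
                                 const double *pA, const double *pB,
                                 double *C, size_t ldc)
{
    double acc[4][8] = {{0.0}};            /* v16-v31: 16 x 2 doubles */

    for (size_t k = 0; k < K; k++)         /* one KERNEL8x4_* step per k */
        for (size_t j = 0; j < 4; j++)
            for (size_t i = 0; i < 8; i++)
                acc[j][i] += pA[8 * k + i] * pB[4 * k + j];

    for (size_t j = 0; j < 4; j++)         /* SAVE8x4: pCRow0..pCRow3 */
        for (size_t i = 0; i < 8; i++)
            C[j * ldc + i] += alpha * acc[j][i];
}
```

The kernel macros in the diff below are the vectorized, software-pipelined version of these two loops.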
@@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha0 d10
 #define alphaV0 v10.d[0]
-#define alpha1 d11
-#define alphaV1 v11.d[0]
-#define alpha2 d14
-#define alphaV2 v14.d[0]
-#define alpha3 d15
-#define alphaV3 v15.d[0]
+#define A_PRE_SIZE 2560
+#define B_PRE_SIZE 448
+#define C_PRE_SIZE 128
 
 // 00 origM
 // 01 origN
@@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 12 pCRow0
 // 13 pCRow1
 // 14 pCRow2
-// 15 pA
-// 16
+// 15 pCRow3
+// 16 pA
 // 17
 // 18 must save
 // 19 must save
@@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v05 pA1_2, pA1_3
 //v06 pA1_4, pA1_5
 //v07 pA1_6, pA1_7
-//v08 must save pB0_0, pB0_1
-//v09 must save pB0_2, pB0_3
-//v10 must save ALPHA0
-//v11 must save ALPHA1
-//v12 must save pB1_0, pB1_1
-//v13 must save pB1_2, pB1_3
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v08 must save pB0_0
+//v09 must save pB0_1
+//v10 must save pB0_2 --> ALPHA0
+//v11 must save pB0_3
+//v12 must save pB1_0
+//v13 must save pB1_1
+//v14 must save pB1_2
+//v15 must save pB1_3
 //v16 must save C00, C01
 //v17 must save C02, C03
 //v18 C04, C05
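Note the reallocation in the map above: each of v8-v15 now holds a single B value rather than a pair, and v10 doubles as ALPHA0 only at save time. That is why, in the hunks below, every SAVE macro begins with `fmov alpha0, alpha` before scaling the accumulators by `alphaV0`.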
@@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_I
-	ld1 {v0.2d, v1.2d}, [pA]
-	add pA, pA, #32
-	ld1 {v2.2d, v3.2d}, [pA]
-	add pA, pA, #32
-	ldp d8, d9, [pB]
-	add pB, pB, #16
-	ldp d10, d11, [pB]
-	add pB, pB, #16
+	ldp q0, q1, [pA], #32
+	ldp d8, d9, [pB], #16
 	fmul v16.2d, v0.2d, v8.2d[0]
-	fmul v17.2d, v1.2d, v8.2d[0]
-	fmul v18.2d, v2.2d, v8.2d[0]
-	fmul v19.2d, v3.2d, v8.2d[0]
 	fmul v20.2d, v0.2d, v9.2d[0]
+	ldp d10, d11, [pB], #16
+	fmul v17.2d, v1.2d, v8.2d[0]
 	fmul v21.2d, v1.2d, v9.2d[0]
-	fmul v22.2d, v2.2d, v9.2d[0]
-	fmul v23.2d, v3.2d, v9.2d[0]
+	ldp q2, q3, [pA], #32
 	fmul v24.2d, v0.2d, v10.2d[0]
-	fmul v25.2d, v1.2d, v10.2d[0]
-	fmul v26.2d, v2.2d, v10.2d[0]
-	fmul v27.2d, v3.2d, v10.2d[0]
 	fmul v28.2d, v0.2d, v11.2d[0]
+	ldp q4, q5, [pA], #32
+	fmul v25.2d, v1.2d, v10.2d[0]
 	fmul v29.2d, v1.2d, v11.2d[0]
-	fmul v30.2d, v2.2d, v11.2d[0]
-	fmul v31.2d, v3.2d, v11.2d[0]
-	ld1 {v4.2d, v5.2d}, [pA]
-	add pA, pA, #32
-	ld1 {v6.2d, v7.2d}, [pA]
-	add pA, pA, #32
-	ldp d12, d13, [pB]
-	add pB, pB, #16
-	ldp d14, d15, [pB]
-	add pB, pB, #16
+	ldp d12, d13, [pB], #16
+	fmul v18.2d, v2.2d, v8.2d[0]
+	fmul v22.2d, v2.2d, v9.2d[0]
+	ldp d14, d15, [pB], #16
+	fmul v26.2d, v2.2d, v10.2d[0]
+	fmul v30.2d, v2.2d, v11.2d[0]
+	ldp q6, q7, [pA], #32
+	fmul v19.2d, v3.2d, v8.2d[0]
+	fmul v27.2d, v3.2d, v10.2d[0]
+	prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+	fmul v31.2d, v3.2d, v11.2d[0]
+	fmul v23.2d, v3.2d, v9.2d[0]
+	prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 .macro KERNEL8x4_M1
 	fmla v16.2d, v0.2d, v8.2d[0]
-	fmla v21.2d, v1.2d, v9.2d[0]
-	fmla v26.2d, v2.2d, v10.2d[0]
-	fmla v31.2d, v3.2d, v11.2d[0]
-	ld1 {v4.2d}, [pA], #16
 	fmla v20.2d, v0.2d, v9.2d[0]
-	fmla v17.2d, v1.2d, v8.2d[0]
-	ld1 {v5.2d}, [pA], #16
-	fmla v30.2d, v2.2d, v11.2d[0]
-	fmla v27.2d, v3.2d, v10.2d[0]
-	ldp d12, d13, [pB]
-	add pB, pB, #16
-	fmla v28.2d, v0.2d, v11.2d[0]
-	fmla v25.2d, v1.2d, v10.2d[0]
-	ldp d14, d15, [pB]
-	add pB, pB, #16
-	fmla v18.2d, v2.2d, v8.2d[0]
-	fmla v23.2d, v3.2d, v9.2d[0]
-	ld1 {v6.2d}, [pA], #16
+	ldp q4, q5, [pA], #32
 	fmla v24.2d, v0.2d, v10.2d[0]
+	fmla v28.2d, v0.2d, v11.2d[0]
+	ldp d12, d13, [pB], #16
+	fmla v17.2d, v1.2d, v8.2d[0]
+	fmla v25.2d, v1.2d, v10.2d[0]
+	prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+	fmla v21.2d, v1.2d, v9.2d[0]
 	fmla v29.2d, v1.2d, v11.2d[0]
-	ld1 {v7.2d}, [pA], #16
+	ldp d14, d15, [pB], #16
+	fmla v18.2d, v2.2d, v8.2d[0]
 	fmla v22.2d, v2.2d, v9.2d[0]
-	fmla v19.2d, v3.2d, v8.2d[0]
-	prfm PLDL1KEEP, [pA, #224]
-	prfm PLDL1KEEP, [pA, #224+64]
+	prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+	fmla v26.2d, v2.2d, v10.2d[0]
+	fmla v30.2d, v2.2d, v11.2d[0]
+	fmla v19.2d, v3.2d, v8.2d[0]
+	fmla v23.2d, v3.2d, v9.2d[0]
+	ldp q6, q7, [pA], #32
+	fmla v27.2d, v3.2d, v10.2d[0]
+	fmla v31.2d, v3.2d, v11.2d[0]
 .endm
 .macro KERNEL8x4_M2
 	fmla v16.2d, v4.2d, v12.2d[0]
-	fmla v21.2d, v5.2d, v13.2d[0]
-	fmla v26.2d, v6.2d, v14.2d[0]
-	fmla v31.2d, v7.2d, v15.2d[0]
-	ld1 {v0.2d}, [pA], #16
 	fmla v20.2d, v4.2d, v13.2d[0]
-	fmla v17.2d, v5.2d, v12.2d[0]
-	ld1 {v1.2d}, [pA], #16
-	fmla v30.2d, v6.2d, v15.2d[0]
-	fmla v27.2d, v7.2d, v14.2d[0]
-	ldp d8, d9, [pB]
-	add pB, pB, #16
+	fmla v24.2d, v4.2d, v14.2d[0]
 	fmla v28.2d, v4.2d, v15.2d[0]
+	ldp q0, q1, [pA], #32
+	fmla v17.2d, v5.2d, v12.2d[0]
 	fmla v25.2d, v5.2d, v14.2d[0]
-	ldp d10, d11, [pB]
-	add pB, pB, #16
-	fmla v22.2d, v6.2d, v13.2d[0]
-	fmla v19.2d, v7.2d, v12.2d[0]
-	ld1 {v2.2d}, [pA], #16
-	fmla v24.2d, v4.2d, v14.2d[0]
+	ldp d8, d9, [pB], #16
+	fmla v21.2d, v5.2d, v13.2d[0]
 	fmla v29.2d, v5.2d, v15.2d[0]
-	ld1 {v3.2d}, [pA], #16
+	ldp d10, d11, [pB], #16
 	fmla v18.2d, v6.2d, v12.2d[0]
+	fmla v22.2d, v6.2d, v13.2d[0]
+	prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+	fmla v26.2d, v6.2d, v14.2d[0]
+	fmla v30.2d, v6.2d, v15.2d[0]
+	fmla v19.2d, v7.2d, v12.2d[0]
 	fmla v23.2d, v7.2d, v13.2d[0]
-	prfm PLDL1KEEP, [pB, #640]
+	ldp q2, q3, [pA], #32
+	fmla v27.2d, v7.2d, v14.2d[0]
+	fmla v31.2d, v7.2d, v15.2d[0]
 .endm
 .macro KERNEL8x4_E
 	fmla v16.2d, v4.2d, v12.2d[0]
-	fmla v17.2d, v5.2d, v12.2d[0]
-	fmla v18.2d, v6.2d, v12.2d[0]
-	fmla v19.2d, v7.2d, v12.2d[0]
 	fmla v20.2d, v4.2d, v13.2d[0]
-	fmla v21.2d, v5.2d, v13.2d[0]
-	fmla v22.2d, v6.2d, v13.2d[0]
-	fmla v23.2d, v7.2d, v13.2d[0]
 	fmla v24.2d, v4.2d, v14.2d[0]
-	fmla v25.2d, v5.2d, v14.2d[0]
-	fmla v26.2d, v6.2d, v14.2d[0]
-	fmla v27.2d, v7.2d, v14.2d[0]
 	fmla v28.2d, v4.2d, v15.2d[0]
+	fmla v17.2d, v5.2d, v12.2d[0]
+	fmla v25.2d, v5.2d, v14.2d[0]
+	fmla v21.2d, v5.2d, v13.2d[0]
 	fmla v29.2d, v5.2d, v15.2d[0]
+	prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+	fmla v18.2d, v6.2d, v12.2d[0]
+	fmla v22.2d, v6.2d, v13.2d[0]
+	fmla v26.2d, v6.2d, v14.2d[0]
 	fmla v30.2d, v6.2d, v15.2d[0]
+	fmla v19.2d, v7.2d, v12.2d[0]
+	fmla v23.2d, v7.2d, v13.2d[0]
+	fmla v27.2d, v7.2d, v14.2d[0]
 	fmla v31.2d, v7.2d, v15.2d[0]
 .endm
 .macro KERNEL8x4_SUB
-	ld1 {v0.2d, v1.2d}, [pA]
-	add pA, pA, #32
-	ld1 {v2.2d, v3.2d}, [pA]
-	add pA, pA, #32
-	ldp d8, d9, [pB]
-	add pB, pB, #16
-	ldp d10, d11, [pB]
-	add pB, pB, #16
+	ldp q0, q1, [pA], #32
+	ldp d8, d9, [pB], #16
 	fmla v16.2d, v0.2d, v8.2d[0]
-	fmla v17.2d, v1.2d, v8.2d[0]
-	fmla v18.2d, v2.2d, v8.2d[0]
-	fmla v19.2d, v3.2d, v8.2d[0]
 	fmla v20.2d, v0.2d, v9.2d[0]
+	ldp d10, d11, [pB], #16
+	fmla v17.2d, v1.2d, v8.2d[0]
 	fmla v21.2d, v1.2d, v9.2d[0]
-	fmla v22.2d, v2.2d, v9.2d[0]
-	fmla v23.2d, v3.2d, v9.2d[0]
+	ldp q2, q3, [pA], #32
 	fmla v24.2d, v0.2d, v10.2d[0]
+	fmla v28.2d, v0.2d, v11.2d[0]
 	fmla v25.2d, v1.2d, v10.2d[0]
+	fmla v29.2d, v1.2d, v11.2d[0]
+	prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+	fmla v18.2d, v2.2d, v8.2d[0]
+	fmla v22.2d, v2.2d, v9.2d[0]
+	prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 	fmla v26.2d, v2.2d, v10.2d[0]
+	fmla v30.2d, v2.2d, v11.2d[0]
+	prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+	fmla v19.2d, v3.2d, v8.2d[0]
 	fmla v27.2d, v3.2d, v10.2d[0]
-	fmla v28.2d, v0.2d, v11.2d[0]
-	fmla v29.2d, v1.2d, v11.2d[0]
-	fmla v30.2d, v2.2d, v11.2d[0]
 	fmla v31.2d, v3.2d, v11.2d[0]
+	fmla v23.2d, v3.2d, v9.2d[0]
 .endm
 .macro SAVE8x4
 	fmov alpha0, alpha
 
-	ld1 {v0.2d, v1.2d}, [pCRow0]
+	prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+	ldp q0, q1, [pCRow0]
 	fmla v0.2d, v16.2d, alphaV0
 	fmla v1.2d, v17.2d, alphaV0
-	st1 {v0.2d, v1.2d}, [pCRow0]
+	stp q0, q1, [pCRow0]
 	add pCRow0, pCRow0, #32
 
-	ld1 {v2.2d, v3.2d}, [pCRow0]
+	prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+	ldp q2, q3, [pCRow0]
 	fmla v2.2d, v18.2d, alphaV0
 	fmla v3.2d, v19.2d, alphaV0
-	st1 {v2.2d, v3.2d}, [pCRow0]
+	stp q2, q3, [pCRow0]
 	add pCRow0, pCRow0, #32
 
-	ld1 {v4.2d, v5.2d}, [pCRow1]
+	prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+	ldp q4, q5, [pCRow1]
 	fmla v4.2d, v20.2d, alphaV0
 	fmla v5.2d, v21.2d, alphaV0
-	st1 {v4.2d, v5.2d}, [pCRow1]
+	stp q4, q5, [pCRow1]
 	add pCRow1, pCRow1, #32
 
-	ld1 {v6.2d, v7.2d}, [pCRow1]
+	prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+	ldp q6, q7, [pCRow1]
 	fmla v6.2d, v22.2d, alphaV0
 	fmla v7.2d, v23.2d, alphaV0
-	st1 {v6.2d, v7.2d}, [pCRow1]
+	stp q6, q7, [pCRow1]
 	add pCRow1, pCRow1, #32
 
-	ld1 {v0.2d, v1.2d}, [pCRow2]
+	prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+	ldp q0, q1, [pCRow2]
 	fmla v0.2d, v24.2d, alphaV0
 	fmla v1.2d, v25.2d, alphaV0
-	st1 {v0.2d, v1.2d}, [pCRow2]
+	stp q0, q1, [pCRow2]
 	add pCRow2, pCRow2, #32
 
-	ld1 {v2.2d, v3.2d}, [pCRow2]
+	prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+	ldp q2, q3, [pCRow2]
 	fmla v2.2d, v26.2d, alphaV0
 	fmla v3.2d, v27.2d, alphaV0
-	st1 {v2.2d, v3.2d}, [pCRow2]
+	stp q2, q3, [pCRow2]
 	add pCRow2, pCRow2, #32
 
-	ld1 {v4.2d, v5.2d}, [pCRow3]
+	prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+	ldp q4, q5, [pCRow3]
 	fmla v4.2d, v28.2d, alphaV0
 	fmla v5.2d, v29.2d, alphaV0
-	st1 {v4.2d, v5.2d}, [pCRow3]
+	stp q4, q5, [pCRow3]
 	add pCRow3, pCRow3, #32
 
-	ld1 {v6.2d, v7.2d}, [pCRow3]
+	prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+	ldp q6, q7, [pCRow3]
 	fmla v6.2d, v30.2d, alphaV0
 	fmla v7.2d, v31.2d, alphaV0
-	st1 {v6.2d, v7.2d}, [pCRow3]
+	stp q6, q7, [pCRow3]
 	add pCRow3, pCRow3, #32
-
-	prfm PLDL2KEEP, [pCRow0, #128]
-	prfm PLDL2KEEP, [pCRow1, #128]
-	prfm PLDL2KEEP, [pCRow2, #128]
-	prfm PLDL2KEEP, [pCRow3, #128]
 .endm
 
 /******************************************************************************/
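Taken together, the macros above form a software-pipelined loop: KERNEL8x4_I primes the accumulators with `fmul` while issuing the first loads, KERNEL8x4_M1 computes from v0-v3/v8-v11 while preloading v4-v7/v12-v15, KERNEL8x4_M2 does the mirror image, and KERNEL8x4_E drains the last preloaded set without issuing further loads; KERNEL8x4_SUB is the standalone single-iteration variant for the K%8 tail. In SAVE8x4, each `prfm PLDL2KEEP` is now issued just ahead of the `ldp`/`stp` pair that touches the same row of C, replacing the four trailing prefetches of the old code.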
@@ -422,30 +433,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x4
+	fmov alpha0, alpha
 	ld1 {v8.2d, v9.2d}, [pCRow0]
 	fmla v8.2d, v16.2d, alphaV0
-	fmla v9.2d, v17.2d, alphaV1
+	fmla v9.2d, v17.2d, alphaV0
 	st1 {v8.2d, v9.2d}, [pCRow0]
 
 	add pCRow1, pCRow0, LDC
 	ld1 {v12.2d, v13.2d}, [pCRow1]
-	fmla v12.2d, v20.2d, alphaV2
-	fmla v13.2d, v21.2d, alphaV3
+	fmla v12.2d, v20.2d, alphaV0
+	fmla v13.2d, v21.2d, alphaV0
 	st1 {v12.2d, v13.2d}, [pCRow1]
 
 	add pCRow2, pCRow1, LDC
 	ld1 {v8.2d, v9.2d}, [pCRow2]
 	fmla v8.2d, v24.2d, alphaV0
-	fmla v9.2d, v25.2d, alphaV1
+	fmla v9.2d, v25.2d, alphaV0
 	st1 {v8.2d, v9.2d}, [pCRow2]
 
 	add pCRow1, pCRow2, LDC
 	ld1 {v12.2d, v13.2d}, [pCRow1]
-	fmla v12.2d, v28.2d, alphaV2
-	fmla v13.2d, v29.2d, alphaV3
+	fmla v12.2d, v28.2d, alphaV0
+	fmla v13.2d, v29.2d, alphaV0
 	st1 {v12.2d, v13.2d}, [pCRow1]
 
 	add pCRow0, pCRow0, #32
@@ -474,6 +486,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x4
+	fmov alpha0, alpha
 	ld1 {v8.2d}, [pCRow0]
 	fmla v8.2d, v16.2d, alphaV0
 	st1 {v8.2d}, [pCRow0]
@@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add pCRow1, pCRow0, LDC
 	ld1 {v12.2d}, [pCRow1]
-	fmla v12.2d, v20.2d, alphaV1
+	fmla v12.2d, v20.2d, alphaV0
 	st1 {v12.2d}, [pCRow1]
 
 	add pCRow2, pCRow1, LDC
 	ld1 {v8.2d}, [pCRow2]
-	fmla v8.2d, v24.2d, alphaV2
+	fmla v8.2d, v24.2d, alphaV0
 	st1 {v8.2d}, [pCRow2]
 
 	add pCRow1, pCRow2, LDC
 	ld1 {v12.2d}, [pCRow1]
-	fmla v12.2d, v28.2d, alphaV3
+	fmla v12.2d, v28.2d, alphaV0
 	st1 {v12.2d}, [pCRow1]
 
 	add pCRow0, pCRow0, #16
@@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x4
+	fmov alpha0, alpha
 	add pCRow1, pCRow0, LDC
 
 	ld1 {v8.d}[0], [pCRow0]
@@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld1 {v12.d}[0], [pCRow2]
 	ld1 {v12.d}[1], [pCRow1]
-	fmla v12.2d, v20.2d, alphaV1
+	fmla v12.2d, v20.2d, alphaV0
 	st1 {v12.d}[0], [pCRow2]
 	st1 {v12.d}[1], [pCRow1]
@@ -571,20 +585,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE8x2
+	fmov alpha0, alpha
 	add pCRow1, pCRow0, LDC
 
 	ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
 	fmla v0.2d, v16.2d, alphaV0
-	fmla v1.2d, v17.2d, alphaV1
-	fmla v2.2d, v18.2d, alphaV2
-	fmla v3.2d, v19.2d, alphaV3
+	fmla v1.2d, v17.2d, alphaV0
+	fmla v2.2d, v18.2d, alphaV0
+	fmla v3.2d, v19.2d, alphaV0
 	st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
 
 	ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
 	fmla v4.2d, v20.2d, alphaV0
-	fmla v5.2d, v21.2d, alphaV1
-	fmla v6.2d, v22.2d, alphaV2
-	fmla v7.2d, v23.2d, alphaV3
+	fmla v5.2d, v21.2d, alphaV0
+	fmla v6.2d, v22.2d, alphaV0
+	fmla v7.2d, v23.2d, alphaV0
 	st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
 
 	add pCRow0, pCRow0, #64
@@ -612,16 +627,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x2
+	fmov alpha0, alpha
 	ld1 {v8.2d, v9.2d}, [pCRow0]
 	fmla v8.2d, v16.2d, alphaV0
-	fmla v9.2d, v17.2d, alphaV1
+	fmla v9.2d, v17.2d, alphaV0
 	st1 {v8.2d, v9.2d}, [pCRow0]
 
 	add pCRow1, pCRow0, LDC
 	ld1 {v12.2d, v13.2d}, [pCRow1]
-	fmla v12.2d, v20.2d, alphaV2
-	fmla v13.2d, v21.2d, alphaV3
+	fmla v12.2d, v20.2d, alphaV0
+	fmla v13.2d, v21.2d, alphaV0
 	st1 {v12.2d, v13.2d}, [pCRow1]
 
 	add pCRow0, pCRow0, #32
@@ -646,6 +662,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x2
+	fmov alpha0, alpha
 	ld1 {v8.2d}, [pCRow0]
 	fmla v8.2d, v16.2d, alphaV0
 	st1 {v8.2d}, [pCRow0]
@@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add pCRow1 , pCRow0, LDC
 	ld1 {v12.2d}, [pCRow1]
-	fmla v12.2d, v20.2d, alphaV1
+	fmla v12.2d, v20.2d, alphaV0
 	st1 {v12.2d}, [pCRow1]
 
 	add pCRow0, pCRow0, #16
@@ -676,6 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x2
+	fmov alpha0, alpha
 	add pCRow1 , pCRow0, LDC
 
 	ld1 {v8.d}[0], [pCRow0]
@@ -713,11 +731,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE8x1
+	fmov alpha0, alpha
 	ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
 	fmla v0.2d, v16.2d, alphaV0
-	fmla v1.2d, v17.2d, alphaV1
-	fmla v2.2d, v18.2d, alphaV2
-	fmla v3.2d, v19.2d, alphaV3
+	fmla v1.2d, v17.2d, alphaV0
+	fmla v2.2d, v18.2d, alphaV0
+	fmla v3.2d, v19.2d, alphaV0
 	st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
 
 	add pCRow0, pCRow0, #64
@@ -743,9 +762,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x1
+	fmov alpha0, alpha
 	ld1 {v8.2d, v9.2d}, [pCRow0]
 	fmla v8.2d, v16.2d, alphaV0
-	fmla v9.2d, v17.2d, alphaV1
+	fmla v9.2d, v17.2d, alphaV0
 	st1 {v8.2d, v9.2d}, [pCRow0]
 
 	add pCRow0, pCRow0, #32
@@ -769,6 +789,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x1
+	fmov alpha0, alpha
 	ld1 {v8.2d}, [pCRow0]
 	fmla v8.2d, v16.2d, alphaV0
 	st1 {v8.2d}, [pCRow0]
@@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x1
+	fmov alpha0, alpha
 	ldr d8, [pCRow0]
 	fmadd d8, d16, alpha0, d8
 	str d8, [pCRow0]
@@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stp x26, x27, [sp, #(9 * 16)]
 	str x28, [sp, #(10 * 16)]
 
+	prfm PLDL1KEEP, [origPB]
+	prfm PLDL1KEEP, [origPA]
+
 	fmov alpha, d0
 	lsl LDC, LDC, #3 // ldc = ldc * 8
@@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN:
 	add pCRow1, pCRow0, LDC
 	add pCRow2, pCRow1, LDC
 	add pCRow3, pCRow2, LDC
 	add pC, pCRow3, LDC
 
 	mov pA, origPA // pA = start of A array
@@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN:
 	cmp counterI, #0
 	ble dgemm_kernel_L4_M4_BEGIN
 
+	.align 5
 dgemm_kernel_L4_M8_20:
 
 	mov pB, origPB
@@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20:
 	subs counterL, counterL, #2 // subtract 2
 	ble dgemm_kernel_L4_M8_22a
-	.align 5
 
+	.align 5
 dgemm_kernel_L4_M8_22:
 
 	KERNEL8x4_M1
@@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22:
 	subs counterL, counterL, #1
 	bgt dgemm_kernel_L4_M8_22
-
+	.align 5
 dgemm_kernel_L4_M8_22a:
 
 	KERNEL8x4_M1
@@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a:
 	b dgemm_kernel_L4_M8_44
 
+	.align 5
 dgemm_kernel_L4_M8_32:
 
 	tst counterL, #1
@@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44:
 	ands counterL , origK, #7
 	ble dgemm_kernel_L4_M8_100
 
+	.align 5
 dgemm_kernel_L4_M8_46:
 
 	KERNEL8x4_SUB
@@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46:
 	bne dgemm_kernel_L4_M8_46
 
 dgemm_kernel_L4_M8_100:
+	prfm PLDL1KEEP, [pA]
+	prfm PLDL1KEEP, [pA, #64]
+	prfm PLDL1KEEP, [origPB]
 
 	SAVE8x4