diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S
index a607fecc4..33e076e6a 100755
--- a/kernel/arm64/dgemm_kernel_8x4.S
+++ b/kernel/arm64/dgemm_kernel_8x4.S
@@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define alpha0	d10
 #define alphaV0	v10.d[0]
-#define alpha1	d11
-#define alphaV1	v11.d[0]
-#define alpha2	d14
-#define alphaV2	v14.d[0]
-#define alpha3	d15
-#define alphaV3	v15.d[0]
+
+#define A_PRE_SIZE	2560
+#define B_PRE_SIZE	448
+#define C_PRE_SIZE	128
 
 // 00	origM
 // 01	origN
@@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 12	pCRow0
 // 13	pCRow1
 // 14	pCRow2
-// 15	pA
-// 16
+// 15	pCRow3
+// 16	pA
 // 17
 // 18	must save
 // 19	must save
@@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v05	pA1_2, pA1_3
 //v06	pA1_4, pA1_5
 //v07	pA1_6, pA1_7
-//v08	must save pB0_0, pB0_1
-//v09	must save pB0_2, pB0_3
-//v10	must save ALPHA0
-//v11	must save ALPHA1
-//v12	must save pB1_0, pB1_1
-//v13	must save pB1_2, pB1_3
-//v14	must save ALPHA2
-//v15	must save ALPHA3
+//v08	must save pB0_0
+//v09	must save pB0_1
+//v10	must save pB0_2 --> ALPHA0
+//v11	must save pB0_3
+//v12	must save pB1_0
+//v13	must save pB1_1
+//v14	must save pB1_2
+//v15	must save pB1_3
 //v16	must save C00, C01
 //v17	must save C02, C03
 //v18	C04, C05
@@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_I
-	ld1	{v0.2d, v1.2d}, [pA]
-	add	pA, pA, #32
-	ld1	{v2.2d, v3.2d}, [pA]
-	add	pA, pA, #32
-	ldp	d8, d9, [pB]
-	add	pB, pB, #16
-	ldp	d10, d11, [pB]
-	add	pB, pB, #16
+	ldp	q0, q1, [pA], #32
+
+	ldp	d8, d9, [pB], #16
 
 	fmul	v16.2d, v0.2d, v8.2d[0]
-	fmul	v17.2d, v1.2d, v8.2d[0]
-
-	fmul	v18.2d, v2.2d, v8.2d[0]
-	fmul	v19.2d, v3.2d, v8.2d[0]
-
 	fmul	v20.2d, v0.2d, v9.2d[0]
+
+	ldp	d10, d11, [pB], #16
+
+	fmul	v17.2d, v1.2d, v8.2d[0]
 	fmul	v21.2d, v1.2d, v9.2d[0]
 
-	fmul	v22.2d, v2.2d, v9.2d[0]
-	fmul	v23.2d, v3.2d, v9.2d[0]
+	ldp	q2, q3, [pA], #32
 
 	fmul	v24.2d, v0.2d, v10.2d[0]
-	fmul	v25.2d, v1.2d, v10.2d[0]
-
-	fmul	v26.2d, v2.2d, v10.2d[0]
-	fmul	v27.2d, v3.2d, v10.2d[0]
-
 	fmul	v28.2d, v0.2d, v11.2d[0]
+
+	ldp	q4, q5, [pA], #32
+
+	fmul	v25.2d, v1.2d, v10.2d[0]
 	fmul	v29.2d, v1.2d, v11.2d[0]
 
-	fmul	v30.2d, v2.2d, v11.2d[0]
-	fmul	v31.2d, v3.2d, v11.2d[0]
+	ldp	d12, d13, [pB], #16
 
-	ld1	{v4.2d, v5.2d}, [pA]
-	add	pA, pA, #32
-	ld1	{v6.2d, v7.2d}, [pA]
-	add	pA, pA, #32
-	ldp	d12, d13, [pB]
-	add	pB, pB, #16
-	ldp	d14, d15, [pB]
-	add	pB, pB, #16
+	fmul	v18.2d, v2.2d, v8.2d[0]
+	fmul	v22.2d, v2.2d, v9.2d[0]
+
+	ldp	d14, d15, [pB], #16
+
+	fmul	v26.2d, v2.2d, v10.2d[0]
+	fmul	v30.2d, v2.2d, v11.2d[0]
+
+	ldp	q6, q7, [pA], #32
+
+	fmul	v19.2d, v3.2d, v8.2d[0]
+	fmul	v27.2d, v3.2d, v10.2d[0]
+
+	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+	fmul	v31.2d, v3.2d, v11.2d[0]
+	fmul	v23.2d, v3.2d, v9.2d[0]
+
+	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNEL8x4_M1
 	fmla	v16.2d, v0.2d, v8.2d[0]
-	fmla	v21.2d, v1.2d, v9.2d[0]
-	fmla	v26.2d, v2.2d, v10.2d[0]
-	fmla	v31.2d, v3.2d, v11.2d[0]
-
-	ld1	{v4.2d}, [pA], #16
-
 	fmla	v20.2d, v0.2d, v9.2d[0]
-	fmla	v17.2d, v1.2d, v8.2d[0]
-	ld1	{v5.2d}, [pA], #16
-
-	fmla	v30.2d, v2.2d, v11.2d[0]
-	fmla	v27.2d, v3.2d, v10.2d[0]
-
-	ldp	d12, d13, [pB]
-	add	pB, pB, #16
-
-	fmla	v28.2d, v0.2d, v11.2d[0]
-	fmla	v25.2d, v1.2d, v10.2d[0]
-
-	ldp	d14, d15, [pB]
-	add	pB, pB, #16
-
-	fmla	v18.2d, v2.2d, v8.2d[0]
-	fmla	v23.2d, v3.2d, v9.2d[0]
-
-	ld1	{v6.2d}, [pA], #16
+	ldp	q4, q5, [pA], #32
 
 	fmla	v24.2d, v0.2d, v10.2d[0]
+	fmla	v28.2d, v0.2d, v11.2d[0]
+
+	ldp	d12, d13, [pB], #16
+
+	fmla	v17.2d, v1.2d, v8.2d[0]
+	fmla	v25.2d, v1.2d, v10.2d[0]
+
+	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+
+	fmla	v21.2d, v1.2d, v9.2d[0]
 	fmla	v29.2d, v1.2d, v11.2d[0]
 
-	ld1	{v7.2d}, [pA], #16
+	ldp	d14, d15, [pB], #16
 
+	fmla	v18.2d, v2.2d, v8.2d[0]
 	fmla	v22.2d, v2.2d, v9.2d[0]
-	fmla	v19.2d, v3.2d, v8.2d[0]
 
-	prfm	PLDL1KEEP, [pA, #224]
-	prfm	PLDL1KEEP, [pA, #224+64]
+	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+	fmla	v26.2d, v2.2d, v10.2d[0]
+	fmla	v30.2d, v2.2d, v11.2d[0]
+	fmla	v19.2d, v3.2d, v8.2d[0]
+	fmla	v23.2d, v3.2d, v9.2d[0]
+
+	ldp	q6, q7, [pA], #32
+
+	fmla	v27.2d, v3.2d, v10.2d[0]
+	fmla	v31.2d, v3.2d, v11.2d[0]
 .endm
 
 .macro KERNEL8x4_M2
 	fmla	v16.2d, v4.2d, v12.2d[0]
-	fmla	v21.2d, v5.2d, v13.2d[0]
-	fmla	v26.2d, v6.2d, v14.2d[0]
-	fmla	v31.2d, v7.2d, v15.2d[0]
-
-	ld1	{v0.2d}, [pA], #16
-
 	fmla	v20.2d, v4.2d, v13.2d[0]
-	fmla	v17.2d, v5.2d, v12.2d[0]
-
-	ld1	{v1.2d}, [pA], #16
-
-	fmla	v30.2d, v6.2d, v15.2d[0]
-	fmla	v27.2d, v7.2d, v14.2d[0]
-
-	ldp	d8, d9, [pB]
-	add	pB, pB, #16
-
+	fmla	v24.2d, v4.2d, v14.2d[0]
 	fmla	v28.2d, v4.2d, v15.2d[0]
+
+	ldp	q0, q1, [pA], #32
+
+	fmla	v17.2d, v5.2d, v12.2d[0]
 	fmla	v25.2d, v5.2d, v14.2d[0]
 
-	ldp	d10, d11, [pB]
-	add	pB, pB, #16
+	ldp	d8, d9, [pB], #16
 
-	fmla	v22.2d, v6.2d, v13.2d[0]
-	fmla	v19.2d, v7.2d, v12.2d[0]
-
-	ld1	{v2.2d}, [pA], #16
-
-	fmla	v24.2d, v4.2d, v14.2d[0]
+	fmla	v21.2d, v5.2d, v13.2d[0]
 	fmla	v29.2d, v5.2d, v15.2d[0]
 
-	ld1	{v3.2d}, [pA], #16
+	ldp	d10, d11, [pB], #16
 
 	fmla	v18.2d, v6.2d, v12.2d[0]
+	fmla	v22.2d, v6.2d, v13.2d[0]
+
+	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+	fmla	v26.2d, v6.2d, v14.2d[0]
+	fmla	v30.2d, v6.2d, v15.2d[0]
+
+	fmla	v19.2d, v7.2d, v12.2d[0]
 	fmla	v23.2d, v7.2d, v13.2d[0]
 
-	prfm	PLDL1KEEP, [pB, #640]
+	ldp	q2, q3, [pA], #32
+
+	fmla	v27.2d, v7.2d, v14.2d[0]
+	fmla	v31.2d, v7.2d, v15.2d[0]
 .endm
 
 .macro KERNEL8x4_E
 	fmla	v16.2d, v4.2d, v12.2d[0]
-	fmla	v17.2d, v5.2d, v12.2d[0]
-	fmla	v18.2d, v6.2d, v12.2d[0]
-	fmla	v19.2d, v7.2d, v12.2d[0]
 	fmla	v20.2d, v4.2d, v13.2d[0]
-	fmla	v21.2d, v5.2d, v13.2d[0]
-	fmla	v22.2d, v6.2d, v13.2d[0]
-	fmla	v23.2d, v7.2d, v13.2d[0]
 	fmla	v24.2d, v4.2d, v14.2d[0]
-	fmla	v25.2d, v5.2d, v14.2d[0]
-	fmla	v26.2d, v6.2d, v14.2d[0]
-	fmla	v27.2d, v7.2d, v14.2d[0]
 	fmla	v28.2d, v4.2d, v15.2d[0]
+
+	fmla	v17.2d, v5.2d, v12.2d[0]
+	fmla	v25.2d, v5.2d, v14.2d[0]
+	fmla	v21.2d, v5.2d, v13.2d[0]
 	fmla	v29.2d, v5.2d, v15.2d[0]
+
+	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+	fmla	v18.2d, v6.2d, v12.2d[0]
+	fmla	v22.2d, v6.2d, v13.2d[0]
+	fmla	v26.2d, v6.2d, v14.2d[0]
 	fmla	v30.2d, v6.2d, v15.2d[0]
+
+	fmla	v19.2d, v7.2d, v12.2d[0]
+	fmla	v23.2d, v7.2d, v13.2d[0]
+	fmla	v27.2d, v7.2d, v14.2d[0]
 	fmla	v31.2d, v7.2d, v15.2d[0]
 .endm
 
 .macro KERNEL8x4_SUB
-	ld1	{v0.2d, v1.2d}, [pA]
-	add	pA, pA, #32
-	ld1	{v2.2d, v3.2d}, [pA]
-	add	pA, pA, #32
-	ldp	d8, d9, [pB]
-	add	pB, pB, #16
-	ldp	d10, d11, [pB]
-	add	pB, pB, #16
+	ldp	q0, q1, [pA], #32
+
+	ldp	d8, d9, [pB], #16
 
 	fmla	v16.2d, v0.2d, v8.2d[0]
-	fmla	v17.2d, v1.2d, v8.2d[0]
-	fmla	v18.2d, v2.2d, v8.2d[0]
-	fmla	v19.2d, v3.2d, v8.2d[0]
 	fmla	v20.2d, v0.2d, v9.2d[0]
+
+	ldp	d10, d11, [pB], #16
+
+	fmla	v17.2d, v1.2d, v8.2d[0]
 	fmla	v21.2d, v1.2d, v9.2d[0]
-	fmla	v22.2d, v2.2d, v9.2d[0]
-	fmla	v23.2d, v3.2d, v9.2d[0]
+
+	ldp	q2, q3, [pA], #32
 
 	fmla	v24.2d, v0.2d, v10.2d[0]
+	fmla	v28.2d, v0.2d, v11.2d[0]
+
+	fmla	v25.2d, v1.2d, v10.2d[0]
+	fmla	v29.2d, v1.2d, v11.2d[0]
+
+	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+	fmla	v18.2d, v2.2d, v8.2d[0]
+	fmla	v22.2d, v2.2d, v9.2d[0]
+
+	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+	fmla	v26.2d, v2.2d, v10.2d[0]
+	fmla	v30.2d, v2.2d, v11.2d[0]
+
+	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+	fmla	v19.2d, v3.2d, v8.2d[0]
 	fmla	v27.2d, v3.2d, v10.2d[0]
-	fmla	v28.2d, v0.2d, v11.2d[0]
-	fmla	v29.2d, v1.2d, v11.2d[0]
-	fmla	v30.2d, v2.2d, v11.2d[0]
 	fmla	v31.2d, v3.2d, v11.2d[0]
+	fmla	v23.2d, v3.2d, v9.2d[0]
 .endm
 
 .macro SAVE8x4
 	fmov	alpha0, alpha
 
-	ld1	{v0.2d, v1.2d}, [pCRow0]
+	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+	ldp	q0, q1, [pCRow0]
 	fmla	v0.2d, v16.2d, alphaV0
 	fmla	v1.2d, v17.2d, alphaV0
-	st1	{v0.2d, v1.2d}, [pCRow0]
+	stp	q0, q1, [pCRow0]
 
 	add	pCRow0, pCRow0, #32
+	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
 
-	ld1	{v2.2d, v3.2d}, [pCRow0]
+	ldp	q2, q3, [pCRow0]
 	fmla	v2.2d, v18.2d, alphaV0
 	fmla	v3.2d, v19.2d, alphaV0
-	st1	{v2.2d, v3.2d}, [pCRow0]
+	stp	q2, q3, [pCRow0]
 
 	add	pCRow0, pCRow0, #32
 
-	ld1	{v4.2d, v5.2d}, [pCRow1]
+	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+
+	ldp	q4, q5, [pCRow1]
 	fmla	v4.2d, v20.2d, alphaV0
 	fmla	v5.2d, v21.2d, alphaV0
-	st1	{v4.2d, v5.2d}, [pCRow1]
+	stp	q4, q5, [pCRow1]
 
 	add	pCRow1, pCRow1, #32
+	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
 
-	ld1	{v6.2d, v7.2d}, [pCRow1]
+	ldp	q6, q7, [pCRow1]
 	fmla	v6.2d, v22.2d, alphaV0
 	fmla	v7.2d, v23.2d, alphaV0
-	st1	{v6.2d, v7.2d}, [pCRow1]
+	stp	q6, q7, [pCRow1]
 
 	add	pCRow1, pCRow1, #32
 
-	ld1	{v0.2d, v1.2d}, [pCRow2]
+	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+	ldp	q0, q1, [pCRow2]
 	fmla	v0.2d, v24.2d, alphaV0
 	fmla	v1.2d, v25.2d, alphaV0
-	st1	{v0.2d, v1.2d}, [pCRow2]
+	stp	q0, q1, [pCRow2]
 
 	add	pCRow2, pCRow2, #32
 
-	ld1	{v2.2d, v3.2d}, [pCRow2]
+	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+	ldp	q2, q3, [pCRow2]
 	fmla	v2.2d, v26.2d, alphaV0
 	fmla	v3.2d, v27.2d, alphaV0
-	st1	{v2.2d, v3.2d}, [pCRow2]
+	stp	q2, q3, [pCRow2]
 
 	add	pCRow2, pCRow2, #32
 
-	ld1	{v4.2d, v5.2d}, [pCRow3]
+	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+
+	ldp	q4, q5, [pCRow3]
 	fmla	v4.2d, v28.2d, alphaV0
 	fmla	v5.2d, v29.2d, alphaV0
-	st1	{v4.2d, v5.2d}, [pCRow3]
+	stp	q4, q5, [pCRow3]
 
 	add	pCRow3, pCRow3, #32
+	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
 
-	ld1	{v6.2d, v7.2d}, [pCRow3]
+	ldp	q6, q7, [pCRow3]
 	fmla	v6.2d, v30.2d, alphaV0
 	fmla	v7.2d, v31.2d, alphaV0
-	st1	{v6.2d, v7.2d}, [pCRow3]
+	stp	q6, q7, [pCRow3]
 
 	add	pCRow3, pCRow3, #32
-
-	prfm	PLDL2KEEP, [pCRow0, #128]
-	prfm	PLDL2KEEP, [pCRow1, #128]
-	prfm	PLDL2KEEP, [pCRow2, #128]
-	prfm	PLDL2KEEP, [pCRow3, #128]
 .endm
 
 /******************************************************************************/
@@ -422,30 +433,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x4
+	fmov	alpha0, alpha
 	ld1	{v8.2d, v9.2d}, [pCRow0]
 	fmla	v8.2d, v16.2d, alphaV0
-	fmla	v9.2d, v17.2d, alphaV1
+	fmla	v9.2d, v17.2d, alphaV0
 	st1	{v8.2d, v9.2d}, [pCRow0]
 
 	add	pCRow1, pCRow0, LDC
 	ld1	{v12.2d, v13.2d}, [pCRow1]
-	fmla	v12.2d, v20.2d, alphaV2
-	fmla	v13.2d, v21.2d, alphaV3
+	fmla	v12.2d, v20.2d, alphaV0
+	fmla	v13.2d, v21.2d, alphaV0
 	st1	{v12.2d, v13.2d}, [pCRow1]
 
 	add	pCRow2, pCRow1, LDC
 	ld1	{v8.2d, v9.2d}, [pCRow2]
 	fmla	v8.2d, v24.2d, alphaV0
-	fmla	v9.2d, v25.2d, alphaV1
+	fmla	v9.2d, v25.2d, alphaV0
 	st1	{v8.2d, v9.2d}, [pCRow2]
 
 	add	pCRow1, pCRow2, LDC
 	ld1	{v12.2d, v13.2d}, [pCRow1]
-	fmla	v12.2d, v28.2d, alphaV2
-	fmla	v13.2d, v29.2d, alphaV3
+	fmla	v12.2d, v28.2d, alphaV0
+	fmla	v13.2d, v29.2d, alphaV0
 	st1	{v12.2d, v13.2d}, [pCRow1]
 
 	add	pCRow0, pCRow0, #32
@@ -474,6 +486,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x4
+	fmov	alpha0, alpha
 	ld1	{v8.2d}, [pCRow0]
 	fmla	v8.2d, v16.2d, alphaV0
 	st1	{v8.2d}, [pCRow0]
@@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	add	pCRow1, pCRow0, LDC
 	ld1	{v12.2d}, [pCRow1]
-	fmla	v12.2d, v20.2d, alphaV1
+	fmla	v12.2d, v20.2d, alphaV0
 	st1	{v12.2d}, [pCRow1]
 
 	add	pCRow2, pCRow1, LDC
 	ld1	{v8.2d}, [pCRow2]
-	fmla	v8.2d, v24.2d, alphaV2
+	fmla	v8.2d, v24.2d, alphaV0
 	st1	{v8.2d}, [pCRow2]
 
 	add	pCRow1, pCRow2, LDC
 	ld1	{v12.2d}, [pCRow1]
-	fmla	v12.2d, v28.2d, alphaV3
+	fmla	v12.2d, v28.2d, alphaV0
 	st1	{v12.2d}, [pCRow1]
 
 	add	pCRow0, pCRow0, #16
@@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x4
+	fmov	alpha0, alpha
 	add	pCRow1, pCRow0, LDC
 
 	ld1	{v8.d}[0], [pCRow0]
@@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	ld1	{v12.d}[0], [pCRow2]
 	ld1	{v12.d}[1], [pCRow1]
-	fmla	v12.2d, v20.2d, alphaV1
+	fmla	v12.2d, v20.2d, alphaV0
 	st1	{v12.d}[0], [pCRow2]
 	st1	{v12.d}[1], [pCRow1]
@@ -571,20 +585,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE8x2
+	fmov	alpha0, alpha
 	add	pCRow1, pCRow0, LDC
 
 	ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
 	fmla	v0.2d, v16.2d, alphaV0
-	fmla	v1.2d, v17.2d, alphaV1
-	fmla	v2.2d, v18.2d, alphaV2
-	fmla	v3.2d, v19.2d, alphaV3
+	fmla	v1.2d, v17.2d, alphaV0
+	fmla	v2.2d, v18.2d, alphaV0
+	fmla	v3.2d, v19.2d, alphaV0
 	st1	{v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
 
 	ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
 	fmla	v4.2d, v20.2d, alphaV0
-	fmla	v5.2d, v21.2d, alphaV1
-	fmla	v6.2d, v22.2d, alphaV2
-	fmla	v7.2d, v23.2d, alphaV3
+	fmla	v5.2d, v21.2d, alphaV0
+	fmla	v6.2d, v22.2d, alphaV0
+	fmla	v7.2d, v23.2d, alphaV0
 	st1	{v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
 
 	add	pCRow0, pCRow0, #64
@@ -612,16 +627,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x2
+	fmov	alpha0, alpha
 	ld1	{v8.2d, v9.2d}, [pCRow0]
 	fmla	v8.2d, v16.2d, alphaV0
-	fmla	v9.2d, v17.2d, alphaV1
+	fmla	v9.2d, v17.2d, alphaV0
 	st1	{v8.2d, v9.2d}, [pCRow0]
 
 	add	pCRow1, pCRow0, LDC
 	ld1	{v12.2d, v13.2d}, [pCRow1]
-	fmla	v12.2d, v20.2d, alphaV2
-	fmla	v13.2d, v21.2d, alphaV3
+	fmla	v12.2d, v20.2d, alphaV0
+	fmla	v13.2d, v21.2d, alphaV0
 	st1	{v12.2d, v13.2d}, [pCRow1]
 
 	add	pCRow0, pCRow0, #32
@@ -646,6 +662,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x2
+	fmov	alpha0, alpha
 	ld1	{v8.2d}, [pCRow0]
 	fmla	v8.2d, v16.2d, alphaV0
 	st1	{v8.2d}, [pCRow0]
@@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	add	pCRow1 , pCRow0, LDC
 	ld1	{v12.2d}, [pCRow1]
-	fmla	v12.2d, v20.2d, alphaV1
+	fmla	v12.2d, v20.2d, alphaV0
 	st1	{v12.2d}, [pCRow1]
 
 	add	pCRow0, pCRow0, #16
@@ -676,6 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x2
+	fmov	alpha0, alpha
 	add	pCRow1 , pCRow0, LDC
 
 	ld1	{v8.d}[0], [pCRow0]
@@ -713,11 +731,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE8x1
+	fmov	alpha0, alpha
 	ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
 	fmla	v0.2d, v16.2d, alphaV0
-	fmla	v1.2d, v17.2d, alphaV1
-	fmla	v2.2d, v18.2d, alphaV2
-	fmla	v3.2d, v19.2d, alphaV3
+	fmla	v1.2d, v17.2d, alphaV0
+	fmla	v2.2d, v18.2d, alphaV0
+	fmla	v3.2d, v19.2d, alphaV0
 	st1	{v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
 
 	add	pCRow0, pCRow0, #64
@@ -743,9 +762,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x1
+	fmov	alpha0, alpha
 	ld1	{v8.2d, v9.2d}, [pCRow0]
 	fmla	v8.2d, v16.2d, alphaV0
-	fmla	v9.2d, v17.2d, alphaV1
+	fmla	v9.2d, v17.2d, alphaV0
 	st1	{v8.2d, v9.2d}, [pCRow0]
 
 	add	pCRow0, pCRow0, #32
@@ -769,6 +789,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x1
+	fmov	alpha0, alpha
 	ld1	{v8.2d}, [pCRow0]
 	fmla	v8.2d, v16.2d, alphaV0
 	st1	{v8.2d}, [pCRow0]
@@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x1
+	fmov	alpha0, alpha
 	ldr	d8, [pCRow0]
 	fmadd	d8, d16, alpha0, d8
 	str	d8, [pCRow0]
@@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stp	x26, x27, [sp, #(9 * 16)]
 	str	x28, [sp, #(10 * 16)]
 
+	prfm	PLDL1KEEP, [origPB]
+	prfm	PLDL1KEEP, [origPA]
+
 	fmov	alpha, d0
 
 	lsl	LDC, LDC, #3	// ldc = ldc * 8
@@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN:
 	add	pCRow1, pCRow0, LDC
 	add	pCRow2, pCRow1, LDC
 	add	pCRow3, pCRow2, LDC
+	add	pC, pCRow3, LDC
 
 	mov	pA, origPA	// pA = start of A array
 
@@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN:
 	cmp	counterI, #0
 	ble	dgemm_kernel_L4_M4_BEGIN
 
+	.align 5
dgemm_kernel_L4_M8_20:
 
 	mov	pB, origPB
@@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20:
 	subs	counterL, counterL, #2		// subtract 2
 	ble	dgemm_kernel_L4_M8_22a
-	.align 5
+
+	.align 5
dgemm_kernel_L4_M8_22:
 
 	KERNEL8x4_M1
@@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22:
 	KERNEL8x4_M1
 	KERNEL8x4_M2
 	subs	counterL, counterL, #1
 	bgt	dgemm_kernel_L4_M8_22
-
+	.align 5
dgemm_kernel_L4_M8_22a:
 	KERNEL8x4_M1
@@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a:
 
 	b	dgemm_kernel_L4_M8_44
 
+	.align 5
dgemm_kernel_L4_M8_32:
 
 	tst	counterL, #1
@@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44:
 	ands	counterL , origK, #7
 	ble	dgemm_kernel_L4_M8_100
 
+	.align 5
dgemm_kernel_L4_M8_46:
 
 	KERNEL8x4_SUB
@@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46:
 	bne	dgemm_kernel_L4_M8_46
 
dgemm_kernel_L4_M8_100:
+	prfm	PLDL1KEEP, [pA]
+	prfm	PLDL1KEEP, [pA, #64]
+	prfm	PLDL1KEEP, [origPB]
 
 	SAVE8x4