diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index d58cef52d..5d1462808 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -46,17 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 +#define pCRow3 x15 +#define pA x16 +#define alphaR w17 +#define alphaI w18 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] -#define alpha1_R s14 -#define alphaV1_R v14.s[0] -#define alpha1_I s15 -#define alphaV1_I v15.s[0] +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -95,8 +97,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 +// 15 pCRow3 +// 16 pA // 17 // 18 must save // 19 must save @@ -121,14 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I -//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R -//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I -//v10 must save ALPHA0_R -//v11 must save ALPHA0_I -//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R -//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I -//v14 must save ALPHA1_R -//v15 must save ALPHA1_I +//v08 must save pB0_00_R, pB0_01_R +//v09 must save pB0_00_I, pB0_01_I +//v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R +//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R +//v13 must save pB1_00_I, pB1_01_I +//v14 must save pB1_02_R, pB1_03_R +//v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R @@ -171,8 +173,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] @@ -189,6 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -200,6 +206,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -211,6 +220,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 + fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -222,56 +234,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + + fmul v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.s[2] + fmls v25.4s, v0.4s, v11.s[0] #else - fmul v25.4s, v0.4s, v9.s[2] + fmul v25.4s, v0.4s, v11.s[0] #endif - OP_ir v25.4s, v1.4s, v8.s[2] + OP_ir v25.4s, v1.4s, v10.s[0] - fmul v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + + fmul v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.s[2] + fmls v27.4s, v2.4s, v11.s[0] #else - fmul v27.4s, v2.4s, v9.s[2] + fmul v27.4s, v2.4s, v11.s[0] #endif - OP_ir v27.4s, v3.4s, v8.s[2] + OP_ir v27.4s, v3.4s, v10.s[0] - fmul v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.s[3] + fmls v29.4s, v0.4s, v11.s[1] #else - fmul v29.4s, v0.4s, v9.s[3] + fmul v29.4s, v0.4s, v11.s[1] #endif - OP_ir v29.4s, v1.4s, v8.s[3] + OP_ir v29.4s, v1.4s, v10.s[1] - fmul v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmul v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.s[3] + fmls v31.4s, v2.4s, v11.s[1] #else - fmul v31.4s, v2.4s, v9.s[3] + fmul v31.4s, v2.4s, v11.s[1] #endif - OP_ir v31.4s, v3.4s, v8.s[3] - - ld2 {v12.4s, v13.4s}, [pB] - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 @@ -280,47 +295,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - ld2 {v12.4s, v13.4s}, [pB] // For next round - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] // For next round - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 @@ -329,47 +353,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 - ld2 {v0.4s, v1.4s}, [pA] - add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E @@ -388,157 +419,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] - - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.s[0] - OP_ii v18.4s, v3.4s, v9.s[0] - OP_ri v19.4s, v2.4s, v9.s[0] - OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 - mov pCRow1, pCRow0 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - ld2 {v0.4s, v1.4s}, [pCRow1] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - 
fmla v1.4s, v17.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow0] - add pCRow2, pCRow1, #32 + add pCRow0, pCRow0, #32 - ld2 {v2.4s, v3.4s}, [pCRow2] + ld2 {v2.4s, v3.4s}, [pCRow0] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R - st2 {v2.4s, v3.4s}, [pCRow2] + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R + st2 {v2.4s, v3.4s}, [pCRow0] - add pCRow1, pCRow1, LDC + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] - add pCRow2, pCRow1, #32 + add pCRow1, pCRow1, #32 - ld2 {v6.4s, v7.4s}, [pCRow2] + ld2 {v6.4s, v7.4s}, [pCRow1] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmla v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmla v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow1] - add pCRow1, pCRow1, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - ld2 {v0.4s, v1.4s}, [pCRow1] + ld2 {v0.4s, v1.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmla v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] + fmla v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow2] - add pCRow2, pCRow1, #32 + add pCRow2, pCRow2, #32 ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I - fmla v3.4s, v26.4s, alphaV1_I - fmla v3.4s, v27.4s, alphaV1_R + fmla v3.4s, v26.4s, alphaV0_I + fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld2 {v4.4s, v5.4s}, [pCRow1] + ld2 {v4.4s, v5.4s}, [pCRow3] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmla v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R - st2 {v4.4s, v5.4s}, [pCRow1] + fmla v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R + st2 {v4.4s, v5.4s}, [pCRow3] - add pCRow2, pCRow1, #32 + add pCRow3, pCRow3, #32 - ld2 {v6.4s, v7.4s}, [pCRow2] + ld2 {v6.4s, v7.4s}, [pCRow3] fmla v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I - fmla v7.4s, v30.4s, alphaV1_I - fmla v7.4s, v31.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmla v7.4s, v30.4s, alphaV0_I + fmla v7.4s, v31.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -720,13 +768,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -734,8 +785,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -743,8 +794,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmla v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R + fmla v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -752,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmla v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R + fmla v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -800,13 +851,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -814,8 +868,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmla v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmla v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -823,8 +877,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I - fmla v1.2s, v24.2s, alphaV1_I - fmla v1.2s, v25.2s, alphaV1_R + fmla v1.2s, v24.2s, alphaV0_I + fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -832,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I - fmla v5.2s, v28.2s, alphaV1_I - fmla v5.2s, v29.2s, alphaV1_R + fmla v5.2s, v28.2s, alphaV0_I + fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -880,13 +934,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -894,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmla s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmla s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -903,8 +960,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s24, alphaV0_R fmls s0, s25, alphaV0_I - fmla s1, s24, alphaV1_I - fmla s1, s25, alphaV1_R + fmla s1, s24, alphaV0_I + fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -912,8 +969,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s28, alphaV0_R fmls s4, s29, alphaV0_I - fmla s5, s28, alphaV1_I - fmla s5, s29, alphaV1_R + fmla s5, s28, alphaV0_I + fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -962,13 +1019,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -976,8 +1036,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC @@ -985,8 +1045,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -994,8 +1054,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v6.4s, v7.4s}, [pCRow2] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmla v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R + fmla v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 @@ -1028,13 +1088,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1042,8 +1105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1076,13 +1139,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1090,8 +1156,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmla v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmla v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1124,13 +1190,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -1138,8 +1207,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmla s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmla s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1174,13 +1243,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 @@ -1188,8 +1260,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pCRow1] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -1216,13 +1288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1248,13 +1323,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1281,13 +1359,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1313,10 +1394,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0_R, s0 - fmov alpha0_I, s1 - fmov alpha1_R, s0 - fmov alpha1_I, s1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -1330,8 +1412,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ cgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -1342,44 +1428,69 @@ cgemm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble cgemm_kernel_L4_M4_BEGIN + .align 5 cgemm_kernel_L4_M8_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , origK, #3 + cmp counterL , #2 blt cgemm_kernel_L4_M8_32 - KERNEL8x4_I // do one in the K - KERNEL8x4_M2 // do another in the K + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble cgemm_kernel_L4_M8_22a - .align 5 + .align 5 cgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M8_22 - + .align 5 cgemm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 + .align 5 cgemm_kernel_L4_M8_32: tst counterL, #1 ble cgemm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 @@ -1390,14 +1501,21 @@ cgemm_kernel_L4_M8_40: cgemm_kernel_L4_M8_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble cgemm_kernel_L4_M8_100 + .align 5 cgemm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne cgemm_kernel_L4_M8_46 + cgemm_kernel_L4_M8_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE8x4 diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index ce5cb0406..680fb56c3 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -46,20 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alphaR w17 +#define alphaI w18 +#define temp x19 +#define tempOffset x20 +#define tempK x21 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] -#define alpha1_R s14 -#define alphaV1_R v14.s[0] -#define alpha1_I s15 -#define alphaV1_I v15.s[0] +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -124,14 +126,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I -//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R -//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I -//v10 must save ALPHA0_R -//v11 must save ALPHA0_I -//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R -//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I -//v14 must save ALPHA1_R -//v15 must save ALPHA1_I +//v08 must save pB0_00_R, pB0_01_R +//v09 must save pB0_00_I, pB0_01_I +//v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R +//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R +//v13 must save pB1_00_I, pB1_01_I +//v14 must save pB1_02_R, pB1_03_R +//v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R @@ -149,6 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -173,8 +176,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] @@ -191,6 +195,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -202,6 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -213,6 +223,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 + fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -224,56 +237,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + + fmul v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.s[2] + fmls v25.4s, v0.4s, v11.s[0] #else - fmul v25.4s, v0.4s, v9.s[2] + fmul v25.4s, v0.4s, v11.s[0] #endif - OP_ir v25.4s, v1.4s, v8.s[2] + OP_ir v25.4s, v1.4s, v10.s[0] - fmul v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + + fmul v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.s[2] + fmls v27.4s, v2.4s, v11.s[0] #else - fmul v27.4s, v2.4s, v9.s[2] + fmul v27.4s, v2.4s, v11.s[0] #endif - OP_ir v27.4s, v3.4s, v8.s[2] + OP_ir v27.4s, v3.4s, v10.s[0] - fmul v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.s[3] + fmls v29.4s, v0.4s, v11.s[1] #else - fmul v29.4s, v0.4s, v9.s[3] + fmul v29.4s, v0.4s, v11.s[1] #endif - OP_ir v29.4s, v1.4s, v8.s[3] + OP_ir v29.4s, v1.4s, v10.s[1] - fmul v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmul v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.s[3] + fmls v31.4s, v2.4s, v11.s[1] #else - fmul v31.4s, v2.4s, v9.s[3] + fmul v31.4s, v2.4s, v11.s[1] #endif - OP_ir v31.4s, v3.4s, v8.s[3] - - ld2 {v12.4s, v13.4s}, [pB] - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 @@ -282,47 +298,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - ld2 {v12.4s, v13.4s}, [pB] // For next round - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] // For next round - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 @@ -331,47 +356,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 - ld2 {v0.4s, v1.4s}, [pA] - add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E @@ -390,157 +422,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] - - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.s[0] - OP_ii v18.4s, v3.4s, v9.s[0] - OP_ri v19.4s, v2.4s, v9.s[0] - OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 - mov pCRow1, pCRow0 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] - - 
add pCRow2, pCRow1, #32 + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow0] + add pCRow0, pCRow0, #32 fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R - st2 {v2.4s, v3.4s}, [pCRow2] - - add pCRow1, pCRow1, LDC + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R + st2 {v2.4s, v3.4s}, [pCRow0] + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] - add pCRow2, pCRow1, #32 - + add pCRow1, pCRow1, #32 fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmul v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] - - add pCRow1, pCRow1, LDC + fmul v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow1] + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmul v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow2] + add pCRow2, pCRow2, #32 fmul v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I - fmul v3.4s, v26.4s, alphaV1_I - fmla v3.4s, v27.4s, alphaV1_R + fmul v3.4s, v26.4s, alphaV0_I + fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] - add pCRow1, pCRow1, LDC - + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmul v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R - st2 {v4.4s, v5.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R + st2 {v4.4s, v5.4s}, [pCRow3] + add pCRow3, pCRow3, #32 fmul v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I - fmul v7.4s, v30.4s, alphaV1_I - fmla v7.4s, v31.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmul v7.4s, v30.4s, alphaV0_I + fmla v7.4s, v31.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -722,13 +763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -736,8 +780,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -745,8 +789,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmul v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R + fmul v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -754,8 +798,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmul v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R + fmul v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -802,13 +846,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -816,8 +863,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmul v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmul v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -825,8 +872,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I - fmul v1.2s, v24.2s, alphaV1_I - fmla v1.2s, v25.2s, alphaV1_R + fmul v1.2s, v24.2s, alphaV0_I + fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -834,8 +881,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I - fmul v5.2s, v28.2s, alphaV1_I - fmla v5.2s, v29.2s, alphaV1_R + fmul v5.2s, v28.2s, alphaV0_I + fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -882,13 +929,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -896,8 +946,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmul s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmul s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -905,8 +955,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s0, s24, alphaV0_R fmls s0, s25, alphaV0_I - fmul s1, s24, alphaV1_I - fmla s1, s25, alphaV1_R + fmul s1, s24, alphaV0_I + fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -914,8 +964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s4, s28, alphaV0_R fmls s4, s29, alphaV0_I - fmul s5, s28, alphaV1_I - fmla s5, s29, alphaV1_R + fmul s5, s28, alphaV0_I + fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -964,13 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE8x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -978,8 +1031,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC @@ -987,8 +1040,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -996,8 +1049,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmul v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R + fmul v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 @@ -1030,13 +1083,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1044,8 +1100,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1078,13 +1134,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1092,8 +1151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmul v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmul v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1126,13 +1185,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -1140,8 +1202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmul s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmul s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1176,13 +1238,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 @@ -1190,8 +1255,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -1218,13 +1283,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1250,13 +1318,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1283,13 +1354,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1315,10 +1389,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0_R, s0 - fmov alpha0_I, s1 - fmov alpha1_R, s0 - fmov alpha1_I, s1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -1335,8 +1410,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ ctrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1370,40 +1450,64 @@ ctrmm_kernel_L4_M8_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt ctrmm_kernel_L4_M8_32 - KERNEL8x4_I // do one in the K - KERNEL8x4_M2 // do another in the K + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble ctrmm_kernel_L4_M8_22a - .align 5 + .align 5 ctrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M8_22 - + .align 5 ctrmm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 + .align 5 ctrmm_kernel_L4_M8_32: tst counterL, #1 ble ctrmm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 @@ -1414,13 +1518,17 @@ ctrmm_kernel_L4_M8_40: ctrmm_kernel_L4_M8_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble ctrmm_kernel_L4_M8_100 + .align 5 ctrmm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne ctrmm_kernel_L4_M8_46 + ctrmm_kernel_L4_M8_100: SAVE8x4 @@ -1440,6 +1548,9 @@ ctrmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] ctrmm_kernel_L4_M8_END: subs counterI, counterI, #1 @@ -1454,9 +1565,8 @@ ctrmm_kernel_L4_M4_BEGIN: tst counterI, #4 ble ctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: - INIT4x4 +ctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1475,38 +1585,47 @@ ctrmm_kernel_L4_M4_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble ctrmm_kernel_L4_M4_40 + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt ctrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble ctrmm_kernel_L4_M4_22a + .align 5 + ctrmm_kernel_L4_M4_22: - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M4_22 - +ctrmm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_E + b ctrmm_kernel_L4_M4_44 +ctrmm_kernel_L4_M4_32: + tst counterL, #1 + ble ctrmm_kernel_L4_M4_40 + KERNEL4x4_I + KERNEL4x4_E + b ctrmm_kernel_L4_M4_44 ctrmm_kernel_L4_M4_40: - ands counterL , tempK, #7 // counterL = counterL % 8 + INIT4x4 + +ctrmm_kernel_L4_M4_44: + ands counterL , tempK, #1 ble ctrmm_kernel_L4_M4_100 -ctrmm_kernel_L4_M4_42: - +ctrmm_kernel_L4_M4_46: KERNEL4x4_SUB - subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M4_42 - ctrmm_kernel_L4_M4_100: SAVE4x4 @@ -1528,7 +1647,6 @@ ctrmm_kernel_L4_M4_100: ctrmm_kernel_L4_M4_END: - ctrmm_kernel_L4_M2_BEGIN: mov counterI, origM diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index b06c7560d..2b8173715 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alpha x17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 -//v08 must save pB0_0, pB0_1 -//v09 must save pB0_2, pB0_3 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB1_0, pB1_1 -//v13 must save pB1_2, pB1_3 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmul v16.2d, v0.2d, v8.d[0] + fmul v20.2d, v0.2d, v9.d[0] + + ldp d10, d11, [pB], #16 + fmul v17.2d, v1.2d, v8.d[0] + fmul v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmul v24.2d, v0.2d, v10.d[0] + fmul v28.2d, v0.2d, v11.d[0] + + ldp q4, q5, [pA], #32 + + fmul v25.2d, v1.2d, v10.d[0] + fmul v29.2d, v1.2d, v11.d[0] + + ldp d12, d13, [pB], #16 + fmul v18.2d, v2.2d, v8.d[0] + fmul v22.2d, v2.2d, v9.d[0] + + ldp d14, d15, [pB], #16 + + fmul v26.2d, v2.2d, v10.d[0] + fmul v30.2d, v2.2d, v11.d[0] + + ldp q6, q7, [pA], #32 + fmul v19.2d, v3.2d, v8.d[0] + fmul v27.2d, v3.2d, v10.d[0] - fmul v20.2d, v0.2d, v8.d[1] - fmul v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.d[1] - fmul v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmul v24.2d, v0.2d, v9.d[0] - fmul v25.2d, v1.2d, v9.d[0] - fmul v26.2d, v2.2d, v9.d[0] - fmul v27.2d, v3.2d, v9.d[0] + fmul v31.2d, v3.2d, v11.d[0] + fmul v23.2d, v3.2d, v9.d[0] - fmul v28.2d, v0.2d, v9.d[1] - fmul v29.2d, v1.2d, v9.d[1] - fmul v30.2d, v2.2d, v9.d[1] - fmul v31.2d, v3.2d, v9.d[1] - - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] + + ldp q4, q5, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + ldp d12, d13, [pB], #16 + fmla v17.2d, v1.2d, v8.d[0] + fmla v25.2d, v1.2d, v10.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v21.2d, v1.2d, v9.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + ldp d14, d15, [pB], #16 + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] fmla v19.2d, v3.2d, v8.d[0] + fmla v23.2d, v3.2d, v9.d[0] - fmla v20.2d, v0.2d, v8.d[1] - fmla v21.2d, v1.2d, v8.d[1] - fmla v22.2d, v2.2d, v8.d[1] - fmla v23.2d, v3.2d, v8.d[1] + ldp q6, q7, [pA], #32 - fmla v24.2d, v0.2d, v9.d[0] - fmla v25.2d, v1.2d, v9.d[0] - fmla v26.2d, v2.2d, v9.d[0] - fmla 
v27.2d, v3.2d, v9.d[0] - - fmla v28.2d, v0.2d, v9.d[1] - fmla v29.2d, v1.2d, v9.d[1] - fmla v30.2d, v2.2d, v9.d[1] - fmla v31.2d, v3.2d, v9.d[1] - - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 - - prfm PLDL1KEEP, [pA, #512] + fmla v27.2d, v3.2d, v10.d[0] + fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + + ldp q0, q1, [pA], #32 + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + + ldp d8, d9, [pB], #16 + + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + ldp d10, d11, [pB], #16 + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] - fmla v20.2d, v4.2d, v12.d[1] - fmla v21.2d, v5.2d, v12.d[1] - fmla v22.2d, v6.2d, v12.d[1] - fmla v23.2d, v7.2d, v12.d[1] + ldp q2, q3, [pA], #32 - fmla v24.2d, v4.2d, v13.d[0] - fmla v25.2d, v5.2d, v13.d[0] - fmla v26.2d, v6.2d, v13.d[0] - fmla v27.2d, v7.2d, v13.d[0] - - fmla v28.2d, v4.2d, v13.d[1] - fmla v29.2d, v5.2d, v13.d[1] - fmla v30.2d, v6.2d, v13.d[1] - fmla v31.2d, v7.2d, v13.d[1] - - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - - prfm PLDL1KEEP, [pB, #512] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v19.2d, v7.2d, v12.d[0] - - fmla v20.2d, v4.2d, v12.d[1] - fmla v21.2d, v5.2d, v12.d[1] - fmla v22.2d, v6.2d, v12.d[1] - fmla v23.2d, v7.2d, v12.d[1] - - fmla v24.2d, v4.2d, v13.d[0] - fmla v25.2d, v5.2d, v13.d[0] - fmla v26.2d, v6.2d, v13.d[0] - fmla v27.2d, v7.2d, v13.d[0] - - fmla v28.2d, v4.2d, v13.d[1] - fmla v29.2d, v5.2d, v13.d[1] - fmla v30.2d, v6.2d, v13.d[1] - fmla v31.2d, v7.2d, v13.d[1] + fmla v23.2d, v7.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] + + ldp d10, d11, [pB], #16 + fmla v17.2d, v1.2d, v8.d[0] + fmla v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + fmla v25.2d, v1.2d, v10.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v19.2d, v3.2d, v8.d[0] + fmla v27.2d, v3.2d, v10.d[0] - fmla v20.2d, v0.2d, v8.d[1] - fmla v21.2d, v1.2d, v8.d[1] - fmla v22.2d, v2.2d, v8.d[1] - fmla v23.2d, v3.2d, v8.d[1] - - fmla v24.2d, v0.2d, v9.d[0] - fmla v25.2d, v1.2d, v9.d[0] - fmla v26.2d, v2.2d, v9.d[0] - fmla 
v27.2d, v3.2d, v9.d[0] - - fmla v28.2d, v0.2d, v9.d[1] - fmla v29.2d, v1.2d, v9.d[1] - fmla v30.2d, v2.2d, v9.d[1] - fmla v31.2d, v3.2d, v9.d[1] + fmla v31.2d, v3.2d, v11.d[0] + fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 - st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + fmul v1.2d, v17.2d, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.2d, v20.2d, alphaV0 - fmul v5.2d, v21.2d, alphaV1 - fmul v6.2d, v22.2d, alphaV2 - fmul v7.2d, v23.2d, alphaV3 - st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmul v5.2d, v21.2d, alphaV0 + stp q4, q5, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul v6.2d, v22.2d, alphaV0 + fmul v7.2d, v23.2d, alphaV0 + stp q6, q7, [pCRow1] + + add pCRow1, pCRow1, #32 + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.2d, v24.2d, alphaV0 - fmul v1.2d, v25.2d, alphaV1 - fmul v2.2d, v26.2d, alphaV2 - fmul v3.2d, v27.2d, alphaV3 - st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] + fmul v1.2d, v25.2d, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + fmul v2.2d, v26.2d, alphaV0 + fmul v3.2d, v27.2d, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0 - fmul v5.2d, v29.2d, alphaV1 - fmul v6.2d, v30.2d, alphaV2 - fmul v7.2d, v31.2d, alphaV3 - st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmul v5.2d, v29.2d, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + fmul v6.2d, v30.2d, alphaV0 + fmul v7.2d, v31.2d, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV2 - fmul v13.2d, v21.2d, alphaV3 + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV0 - fmul v9.2d, v25.2d, alphaV1 + fmul v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC - fmul v12.2d, v28.2d, alphaV2 - fmul v13.2d, v29.2d, alphaV3 + fmul v12.2d, v28.2d, alphaV0 + fmul v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
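SAVE8x4 above is where the new register layout pays off on the store side: each of the four C columns has its own pointer (pCRow0..pCRow3), the stores become paired stp of q registers interleaved with PLDL2KEEP prefetches of the next C cache lines, and every product is scaled by the single alphaV0. In C terms the macro amounts to roughly the following (the column-major layout and the pcrow naming are assumptions on my part):

/* Illustrative model of SAVE8x4 (dtrmm): scale the 8x4 accumulator
 * block by alpha and overwrite C (TRMM does not read the old C). */
void save8x4(double *pcrow[4], const double acc[4][8], double alpha)
{
    for (int j = 0; j < 4; j++) {           /* one pointer per column  */
        for (int i = 0; i < 8; i++)         /* fmul v.2d, acc, alphaV0 */
            pcrow[j][i] = alpha * acc[j][i];
        pcrow[j] += 8;                      /* two "add pCRowj, #32"   */
    }
}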
.endm .macro SAVE2x4 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC - fmul v8.2d, v24.2d, alphaV2 + fmul v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC - fmul v12.2d, v28.2d, alphaV3 + fmul v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha add pCRow1, pCRow0, LDC fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 + fmul v1.2d, v17.2d, alphaV0 + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmul v4.2d, v20.2d, alphaV0 - fmul v5.2d, v21.2d, alphaV1 - fmul v6.2d, v22.2d, alphaV2 - fmul v7.2d, v23.2d, alphaV3 + fmul v5.2d, v21.2d, alphaV0 + fmul v6.2d, v22.2d, alphaV0 + fmul v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV2 - fmul v13.2d, v21.2d, alphaV3 + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha add pCRow1 , pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 + fmul v1.2d, v17.2d, alphaV0 + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
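The edge-tile SAVE macros above follow the same pattern: because v10 now carries B values during the k-loop, each macro first rebuilds alpha0 from the GPR copy with fmov and then scales by the single alphaV0. The 1xN cases pack two column results per vector register and lane-store them to the individual column pointers; in plain C that degenerates to scalar stores, as in this assumed-shape sketch:

/* Sketch of SAVE1x4: one C element in each of four columns,
 * overwritten with alpha times the accumulated dot product. */
void save1x4(double *c0, double *c1, double *c2, double *c3,
             const double acc[4], double alpha)
{
    *c0 = alpha * acc[0];   /* st1 {v8.d}[0]  */
    *c1 = alpha * acc[1];   /* st1 {v8.d}[1]  */
    *c2 = alpha * acc[2];   /* st1 {v12.d}[0] */
    *c3 = alpha * acc[3];   /* st1 {v12.d}[1] */
}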
.endm .macro SAVE1x1 + fmov alpha0, alpha fmul d8, d16, alpha0 str d8, [pCRow0] @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, d0 - fmov alpha1, d0 - fmov alpha2, d0 - fmov alpha3, d0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ dtrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble dtrmm_kernel_L4_M4_BEGIN + .align 5 dtrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 + asr counterL , tempK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? blt dtrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble dtrmm_kernel_L4_M8_22a - .align 5 + .align 5 dtrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M8_22 - + .align 5 dtrmm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 + .align 5 dtrmm_kernel_L4_M8_32: tst counterL, #1 ble dtrmm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: dtrmm_kernel_L4_M8_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble dtrmm_kernel_L4_M8_100 + .align 5 dtrmm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne dtrmm_kernel_L4_M8_46 + dtrmm_kernel_L4_M8_100: SAVE8x4 @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] dtrmm_kernel_L4_M8_END: subs counterI, counterI, #1 diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 68366d9f2..6e3645b76 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -46,16 +46,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 +#define pCRow3 x15 +#define pA x16 +#define alpha w17 #define alpha0 s10 #define alphaV0 v10.s[0] -#define alpha1 s11 -#define alphaV1 v11.s[0] -#define alpha2 s14 -#define alphaV2 v14.s[0] -#define alpha3 s15 -#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 160 + // 00 origM // 01 origN @@ -98,14 +99,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
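Two recurring changes show up in the prologue and in the new #define block above: each outer N-panel now sets up all four column pointers once (pCRow1..pCRow3 at successive LDC offsets, with pC advanced past the whole panel), and the fixed prefetch distances A_PRE_SIZE/B_PRE_SIZE/C_PRE_SIZE replace the old ad-hoc #512 offsets, so every prfm runs a constant number of bytes ahead of the streaming pA/pB/pC pointers. A loose C analogue of both ideas (the byte distances are copied from the defines; everything else, including the element-sized ldc, is illustrative):

#define A_PRE_BYTES 2560   /* A_PRE_SIZE */
#define B_PRE_BYTES 448    /* B_PRE_SIZE in the dtrmm/cgemm kernels */

/* Hypothetical outline of one N=4 panel: set up the four column
 * pointers once, then keep prefetching a fixed distance ahead of the
 * packed A and B streams while the micro-kernels consume them.
 * ldc is in elements here; the asm works in bytes after lsl LDC, #3. */
void panel_setup(double **pC, long ldc, const double *pA, const double *pB,
                 double *pcrow[4])
{
    pcrow[0] = *pC;
    pcrow[1] = pcrow[0] + ldc;      /* add pCRow1, pCRow0, LDC */
    pcrow[2] = pcrow[1] + ldc;
    pcrow[3] = pcrow[2] + ldc;
    *pC      = pcrow[3] + ldc;      /* add pC, pCRow3, LDC     */

    __builtin_prefetch((const char *)pA + A_PRE_BYTES, 0, 3);
    __builtin_prefetch((const char *)pB + B_PRE_BYTES, 0, 3);
}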
//v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 -//v08 must save pB00, pB01 -//v09 must save pB02, pB03 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB10, pB11 -//v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB00 +//v09 must save pB01 +//v10 must save pB02 +//v11 must save pB03 +//v12 must save pB10 +//v13 must save pB11 +//v14 must save pB12 +//v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 @@ -147,206 +148,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + + ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmul v17.4s, v1.4s, v8.s[0] + fmul v21.4s, v1.4s, v9.s[0] + + ldp q4, q5, [pA], #32 + + fmul v25.4s, v1.4s, v10.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + fmul v18.4s, v2.4s, v8.s[0] + fmul v22.4s, v2.4s, v9.s[0] + + ldp s14, s15, [pB], #8 + fmul v19.4s, v3.4s, v8.s[0] + fmul v23.4s, v3.4s, v9.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.s[1] - fmul v23.4s, v3.4s, v8.s[1] + ldp q6, q7, [pA], #32 - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v26.4s, v2.4s, v9.s[0] - fmul v27.4s, v3.4s, v9.s[0] + fmul v26.4s, v2.4s, v10.s[0] + fmul v30.4s, v2.4s, v11.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - fmul v30.4s, v2.4s, v9.s[1] - fmul v31.4s, v3.4s, v9.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + fmul v27.4s, v3.4s, v10.s[0] + fmul v31.4s, v3.4s, v11.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] + + ldp q4, q5, [pA], #32 + fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + ldp s12, s13, [pB], #8 - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v22.4s, v2.4s, v9.s[0] + fmla v23.4s, v3.4s, v9.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + ldp s14, s15, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.4s, v2.4s, v10.s[0] + fmla v27.4s, v3.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp q6, q7, [pA], #32 + + fmla v30.4s, v2.4s, v11.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] 
fmla v17.4s, v5.4s, v12.s[0] + + ldp q0, q1, [pA], #32 + fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + ldp s8, s9, [pB], #8 - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v22.4s, v6.4s, v13.s[0] + fmla v23.4s, v7.4s, v13.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp s10, s11, [pB], #8 + + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.4s, v6.4s, v14.s[0] + fmla v27.4s, v7.4s, v14.s[0] + + ldp q2, q3, [pA], #32 + + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + fmla v30.4s, v6.4s, v15.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v18.4s, v6.4s, v12.s[0] + fmla v22.4s, v6.4s, v13.s[0] + fmla v26.4s, v6.4s, v14.s[0] + fmla v30.4s, v6.4s, v15.s[0] + fmla v19.4s, v7.4s, v12.s[0] - - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] - - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] - - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v23.4s, v7.4s, v13.s[0] + fmla v27.4s, v7.4s, v14.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmla v17.4s, v1.4s, v8.s[0] + fmla v21.4s, v1.4s, v9.s[0] + + fmla v25.4s, v1.4s, v10.s[0] + fmla v29.4s, v1.4s, v11.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v22.4s, v2.4s, v9.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v8.s[0] + fmla v23.4s, v3.4s, v9.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v26.4s, v2.4s, v10.s[0] + fmla v30.4s, v2.4s, v11.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v27.4s, v3.4s, v10.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.4s, 
v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + ldp q2, q3, [pCRow0] + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - fmla v6.4s, v22.4s, alphaV2 - fmla v7.4s, v23.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v5.4s, v21.4s, alphaV0 + stp q4, q5, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + ldp q6, q7, [pCRow1] + fmla v6.4s, v22.4s, alphaV0 + fmla v7.4s, v23.4s, alphaV0 + stp q6, q7, [pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q0, q1, [pCRow2] fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - fmla v2.4s, v26.4s, alphaV2 - fmla v3.4s, v27.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmla v1.4s, v25.4s, alphaV0 + stp q0, q1, [pCRow2] - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + add pCRow2, pCRow2, #32 + + ldp q2, q3, [pCRow2] + fmla v2.4s, v26.4s, alphaV0 + fmla v3.4s, v27.4s, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + ldp q4, q5, [pCRow3] fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - fmla v6.4s, v30.4s, alphaV2 - fmla v7.4s, v31.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v5.4s, v29.4s, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + + ldp q6, q7, [pCRow3] + fmla v6.4s, v30.4s, alphaV0 + fmla v7.4s, v31.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -363,264 +407,217 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
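Unlike the trmm kernels, the sgemm SAVE16x4 above is a read-modify-write: each 32-byte chunk of C is loaded with ldp, updated with fmla against the single alphaV0, stored back with stp, and the matching column pointer is bumped by #32, with a PLDL2KEEP issued per column. A compact C rendering of the update (the 4x16 accumulator layout is my shorthand for v16..v31):

/* Sketch of SAVE16x4 (sgemm): C(:, j) += alpha * acc(:, j) for the
 * current 16x4 block, written through one pointer per column. */
void save16x4(float *pcrow[4], const float acc[4][16], float alpha)
{
    for (int j = 0; j < 4; j++) {
        for (int i = 0; i < 16; i++)       /* ldp / fmla / stp in asm */
            pcrow[j][i] += alpha * acc[j][i];
        pcrow[j] += 16;                    /* two "add pCRowj, #32"   */
    }
}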
.endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] + fmul v20.4s, v0.4s, v9.s[0] + fmul v21.4s, v1.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v25.4s, v1.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + fmul v29.4s, v1.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] - - add pCRow2, pCRow1, LDC - - 
ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] - - add pCRow1, pCRow2, LDC - - ld1 {v0.4s, v1.4s}, [pCRow2] - fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] - - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + fmla v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 + + ldp q2, q3, [pCRow1] + fmla v2.4s, v20.4s, alphaV0 + fmla v3.4s, v21.4s, alphaV0 + stp q2, q3, [pCRow1] + + add pCRow1, pCRow1, #32 + + ldp q4, q5, [pCRow2] + fmla v4.4s, v24.4s, alphaV0 + fmla v5.4s, v25.4s, alphaV0 + stp q4, q5, [pCRow2] + + add pCRow2, pCRow2, #32 + + ldp q6, q7, [pCRow3] + fmla v6.4s, v28.4s, alphaV0 + fmla v7.4s, v29.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s20, wzr + fmov s24, wzr + fmov s28, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - prfm PLDL1KEEP, [pB, #512] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] // For next round - add pB, pB, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + ldr q0, [pA], #16 .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] .endm .macro KERNEL4x4_SUB - ld1 
{v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 - ld1 {v8.2s, v9.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] - - add pCRow2, pCRow1, LDC - ld1 {v8.2s, v9.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV0 - fmla v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] - - add pCRow1, pCRow2, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV2 - fmla v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] add pCRow0, pCRow0, #16 + + ldr q1, [pCRow1] + fmla v1.4s, v20.4s, alphaV0 + str q1, [pCRow1] + + add pCRow1, pCRow1, #16 + + ldr q2, [pCRow2] + fmla v2.4s, v24.4s, alphaV0 + str q2, [pCRow2] + + add pCRow2, pCRow2, #16 + + ldr q3, [pCRow3] + fmla v3.4s, v28.4s, alphaV0 + str q3, [pCRow3] + + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -633,38 +630,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v20.2s, v0.2s, v9.s[0] + fmla v24.2s, v0.2s, v10.s[0] + fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 - ld1 {v8.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] - - add pCRow2, pCRow1, LDC - ld1 {v8.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] - - add pCRow1, pCRow2, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + ldr d0, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] add pCRow0, pCRow0, #8 + + ldr d1, [pCRow1] + fmla v1.2s, v20.2s, alphaV0 + str d1, [pCRow1] + + add pCRow1, pCRow1, #8 + + ldr d0, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow2, #8 + + ldr d1, [pCRow3] + fmla v1.2s, v28.2s, alphaV0 + str d1, [pCRow3] + + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -686,22 +688,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
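The 4x4 kernel above is simplified in the same spirit: instead of two 2-lane A vectors feeding eight half-width accumulators in a scrambled order, one k-iteration now loads a single 4-float column of A and four B scalars, and does four fmla-by-lane updates into v16/v20/v24/v28. As a C sketch of one such iteration (array shapes assumed):

/* One k-step of the rewritten KERNEL4x4: a rank-1 update of the 4x4
 * accumulator with a 4-element A column and a 4-element B row. */
void kernel4x4_step(float acc[4][4], const float a[4], const float b[4])
{
    for (int j = 0; j < 4; j++)            /* b[j] ~ v8/v9/v10/v11.s[0] */
        for (int i = 0; i < 4; i++)        /* fmla v.4s, v0.4s, b[j]    */
            acc[j][i] += a[i] * b[j];
}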
.endm .macro SAVE1x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC - ld1 {v12.s}[0], [pCRow2] - ld1 {v12.s}[1], [pCRow1] - fmla v12.2s, v20.2s, alphaV1 - st1 {v12.s}[0], [pCRow2] - st1 {v12.s}[1], [pCRow1] - add pCRow0, pCRow0, #4 + add pCRow1, pCRow1, #4 + + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow3] + fmla v12.2s, v20.2s, alphaV0 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow3] + + add pCRow2, pCRow2, #4 + add pCRow3, pCRow3, #4 .endm /******************************************************************************/ @@ -741,20 +746,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 + fmla v1.4s, v17.4s, alphaV0 + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - fmla v6.4s, v22.4s, alphaV2 - fmla v7.4s, v23.4s, alphaV3 + fmla v5.4s, v21.4s, alphaV0 + fmla v6.4s, v22.4s, alphaV0 + fmla v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -785,18 +792,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 + fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 + fmla v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -824,15 +833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha + ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 + fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 + fmla v12.2s, v20.2s, alphaV0 + fmla v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -857,13 +868,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha + ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 + fmla v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 @@ -886,6 +899,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha + add pCRow1 , pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] @@ -925,11 +940,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE16x1 + fmov alpha0, alpha + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 + fmla v1.4s, v17.4s, alphaV0 + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 @@ -956,9 +973,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha + ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 + fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 @@ -983,9 +1002,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha + ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 + fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 @@ -1008,6 +1029,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha + ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] @@ -1032,6 +1055,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha + ldr s8, [pCRow0] fmla s8, s16, alphaV0 str s8, [pCRow0] @@ -1061,10 +1086,10 @@ sgemm_kernel_begin: stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, s0 - fmov alpha1, s0 - fmov alpha2, s0 - fmov alpha3, s0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 @@ -1078,8 +1103,12 @@ sgemm_kernel_begin: /******************************************************************************/ sgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -1090,42 +1119,69 @@ sgemm_kernel_L4_M16_BEGIN: cmp counterI, #0 ble sgemm_kernel_L4_M8_BEGIN + .align 5 sgemm_kernel_L4_M16_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , origK, #3 + cmp counterL , #2 blt sgemm_kernel_L4_M16_32 - KERNEL16x4_I // do one in the K - KERNEL16x4_M2 // do another in the K + KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #2 ble sgemm_kernel_L4_M16_22a - .align 5 + .align 5 sgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M16_22 + .align 5 sgemm_kernel_L4_M16_22a: + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 + .align 5 sgemm_kernel_L4_M16_32: tst counterL, #1 ble sgemm_kernel_L4_M16_40 KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 @@ -1136,14 +1192,20 @@ sgemm_kernel_L4_M16_40: sgemm_kernel_L4_M16_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble sgemm_kernel_L4_M16_100 + .align 5 sgemm_kernel_L4_M16_46: KERNEL16x4_SUB + subs counterL, counterL, #1 + bne sgemm_kernel_L4_M16_46 sgemm_kernel_L4_M16_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE16x4 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 28b321651..77e05103d 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -46,19 +46,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alpha w17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 #define alpha0 s10 #define alphaV0 v10.s[0] -#define alpha1 s11 -#define alphaV1 v11.s[0] -#define alpha2 s14 -#define alphaV2 v14.s[0] -#define alpha3 s15 -#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 160 + // 00 origM // 01 origN @@ -101,14 +102,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 -//v08 must save pB00, pB01 -//v09 must save pB02, pB03 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB10, pB11 -//v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB00 +//v09 must save pB01 +//v10 must save pB02 +//v11 must save pB03 +//v12 must save pB10 +//v13 must save pB11 +//v14 must save pB12 +//v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 @@ -150,202 +151,240 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + + ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmul v17.4s, v1.4s, v8.s[0] + fmul v21.4s, v1.4s, v9.s[0] + + ldp q4, q5, [pA], #32 + + fmul v25.4s, v1.4s, v10.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + fmul v18.4s, v2.4s, v8.s[0] + fmul v22.4s, v2.4s, v9.s[0] + + ldp s14, s15, [pB], #8 + fmul v19.4s, v3.4s, v8.s[0] + fmul v23.4s, v3.4s, v9.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.s[1] - fmul v23.4s, v3.4s, v8.s[1] + ldp q6, q7, [pA], #32 - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v26.4s, v2.4s, v9.s[0] - fmul v27.4s, v3.4s, v9.s[0] + fmul v26.4s, v2.4s, v10.s[0] + fmul v30.4s, v2.4s, v11.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - fmul v30.4s, v2.4s, v9.s[1] - fmul v31.4s, v3.4s, v9.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + fmul v27.4s, v3.4s, v10.s[0] + fmul v31.4s, v3.4s, v11.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] + + ldp q4, q5, [pA], #32 + fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + ldp s12, s13, [pB], #8 - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v22.4s, v2.4s, v9.s[0] + fmla v23.4s, v3.4s, v9.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + ldp s14, s15, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.4s, v2.4s, v10.s[0] + fmla v27.4s, v3.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp q6, q7, [pA], #32 + + fmla v30.4s, v2.4s, v11.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] + + ldp q0, q1, [pA], #32 + fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + ldp s8, s9, [pB], #8 - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v22.4s, v6.4s, v13.s[0] + fmla v23.4s, v7.4s, v13.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 
{v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp s10, s11, [pB], #8 + + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.4s, v6.4s, v14.s[0] + fmla v27.4s, v7.4s, v14.s[0] + + ldp q2, q3, [pA], #32 + + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + fmla v30.4s, v6.4s, v15.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v18.4s, v6.4s, v12.s[0] + fmla v22.4s, v6.4s, v13.s[0] + fmla v26.4s, v6.4s, v14.s[0] + fmla v30.4s, v6.4s, v15.s[0] + fmla v19.4s, v7.4s, v12.s[0] - - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] - - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] - - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v23.4s, v7.4s, v13.s[0] + fmla v27.4s, v7.4s, v14.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmla v17.4s, v1.4s, v8.s[0] + fmla v21.4s, v1.4s, v9.s[0] + + fmla v25.4s, v1.4s, v10.s[0] + fmla v29.4s, v1.4s, v11.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v22.4s, v2.4s, v9.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v8.s[0] + fmla v23.4s, v3.4s, v9.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v26.4s, v2.4s, v10.s[0] + fmla v30.4s, v2.4s, v11.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v27.4s, v3.4s, v10.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmul v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 + + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - fmul v6.4s, v22.4s, alphaV2 - fmul v7.4s, v23.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmul v5.4s, v21.4s, alphaV0 + stp q4, q5, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 + + fmul v6.4s, v22.4s, alphaV0 + fmul v7.4s, v23.4s, alphaV0 + stp q6, q7, 
[pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0 - fmul v1.4s, v25.4s, alphaV1 - fmul v2.4s, v26.4s, alphaV2 - fmul v3.4s, v27.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmul v1.4s, v25.4s, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #32 + + fmul v2.4s, v26.4s, alphaV0 + fmul v3.4s, v27.4s, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0 - fmul v5.4s, v29.4s, alphaV1 - fmul v6.4s, v30.4s, alphaV2 - fmul v7.4s, v31.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmul v5.4s, v29.4s, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + fmul v6.4s, v30.4s, alphaV0 + fmul v7.4s, v31.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -362,260 +401,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] + fmul v20.4s, v0.4s, v9.s[0] + fmul v21.4s, v1.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v25.4s, v1.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + fmul v29.4s, v1.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, 
v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] - - add pCRow2, pCRow1, LDC - - fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] - - add pCRow1, pCRow2, LDC - - fmul v0.4s, v24.4s, alphaV0 - fmul v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] - - fmul v4.4s, v28.4s, alphaV0 - fmul v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + fmul v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 + + fmul v2.4s, v20.4s, alphaV0 + fmul v3.4s, v21.4s, alphaV0 + stp q2, q3, [pCRow1] + + add pCRow1, pCRow1, #32 + + fmul v4.4s, v24.4s, alphaV0 + fmul v5.4s, v25.4s, alphaV0 + stp q4, q5, [pCRow2] + + add pCRow2, pCRow2, #32 + + fmul v6.4s, v28.4s, alphaV0 + fmul v7.4s, v29.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s20, wzr + fmov s24, wzr + fmov s28, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - prfm PLDL1KEEP, [pB, #512] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] 
// For next round - add pB, pB, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + ldr q0, [pA], #16 .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] .endm .macro KERNEL4x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 + fmov alpha0, alpha - fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] - - add pCRow1, pCRow0, LDC - - fmul v12.2s, v20.2s, alphaV2 - fmul v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] - - add pCRow2, pCRow1, LDC - - fmul v8.2s, v24.2s, alphaV0 - fmul v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] - - add pCRow1, pCRow2, LDC - - fmul v12.2s, v28.2s, alphaV2 - fmul v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + fmul v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] add pCRow0, pCRow0, #16 + + fmul v1.4s, v20.4s, alphaV0 + str q1, [pCRow1] + + add pCRow1, pCRow1, #16 + + fmul v2.4s, v24.4s, alphaV0 + str q2, [pCRow2] + + add pCRow2, pCRow2, #16 + + fmul v3.4s, v28.4s, alphaV0 + str q3, [pCRow3] + + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -628,34 +616,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v20.2s, v0.2s, v9.s[0] + fmla v24.2s, v0.2s, v10.s[0] + fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 - fmul v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - fmul v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] - - add pCRow2, pCRow1, LDC - fmul v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] - - add pCRow1, pCRow2, LDC - fmul v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + fmul v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] add pCRow0, pCRow0, #8 + + fmul v1.2s, v20.2s, alphaV0 + str d1, [pCRow1] + + add pCRow1, pCRow1, #8 + + fmul v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow2, #8 + + fmul v1.2s, v28.2s, alphaV0 + str d1, [pCRow3] + + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -677,20 +670,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC - - fmul v12.2s, v20.2s, alphaV1 - st1 {v12.s}[0], [pCRow2] - st1 {v12.s}[1], [pCRow1] - add pCRow0, pCRow0, #4 + add pCRow1, pCRow1, #4 + + fmul v12.2s, v20.2s, alphaV0 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow3] + + add pCRow2, pCRow2, #4 + add pCRow3, pCRow3, #4 .endm /******************************************************************************/ @@ -730,18 +724,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 + fmul v1.4s, v17.4s, alphaV0 + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - fmul v6.4s, v22.4s, alphaV2 - fmul v7.4s, v23.4s, alphaV3 + fmul v5.4s, v21.4s, alphaV0 + fmul v6.4s, v22.4s, alphaV0 + fmul v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -772,16 +768,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 + fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 + fmul v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -809,15 +807,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 + fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2s, v20.2s, alphaV2 - fmul v13.2s, v21.2s, alphaV3 + fmul v12.2s, v20.2s, alphaV0 + fmul v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -842,12 +841,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x2 + fmov alpha0, alpha + fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC - fmul v12.2s, v20.2s, alphaV1 + fmul v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 @@ -870,6 +871,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha + add pCRow1 , pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 @@ -908,11 +911,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x1 + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 + fmul v1.4s, v17.4s, alphaV0 + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 @@ -939,9 +943,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 + fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 @@ -966,9 +971,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 + fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 @@ -991,6 +997,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] @@ -1015,6 +1022,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha fmul s8, s16, alpha0 str s8, [pCRow0] @@ -1043,10 +1051,10 @@ strmm_kernel_begin: stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, s0 - fmov alpha1, s0 - fmov alpha2, s0 - fmov alpha3, s0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 @@ -1063,8 +1071,13 @@ strmm_kernel_begin: /******************************************************************************/ strmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1078,6 +1091,7 @@ strmm_kernel_L4_M16_BEGIN: cmp counterI, #0 ble strmm_kernel_L4_M8_BEGIN + .align 5 strmm_kernel_L4_M16_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1098,38 +1112,64 @@ strmm_kernel_L4_M16_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt strmm_kernel_L4_M16_32 - KERNEL16x4_I // do one in the K - KERNEL16x4_M2 // do another in the K + KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #2 ble strmm_kernel_L4_M16_22a - .align 5 + .align 5 strmm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M16_22 + .align 5 strmm_kernel_L4_M16_22a: + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 + .align 5 strmm_kernel_L4_M16_32: tst counterL, #1 ble strmm_kernel_L4_M16_40 KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 @@ -1140,12 +1180,15 @@ strmm_kernel_L4_M16_40: strmm_kernel_L4_M16_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble strmm_kernel_L4_M16_100 + .align 5 strmm_kernel_L4_M16_46: KERNEL16x4_SUB + subs counterL, counterL, #1 + bne strmm_kernel_L4_M16_46 strmm_kernel_L4_M16_100: @@ -1166,6 +1209,9 @@ strmm_kernel_L4_M16_100: #if defined(LEFT) add tempOffset, tempOffset, #16 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] strmm_kernel_L4_M16_END: subs counterI, counterI, #1 diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 1cb695e56..08a1531cf 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define alpha_save_R x16 -#define alpha_save_I x17 +#define pCRow3 x15 +#define pA x16 +#define alphaR x17 +#define alphaI x18 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] -#define alpha1_R d14 -#define alphaV1_R v14.d[0] -#define alpha1_I d15 -#define alphaV1_I v15.d[0] - +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 alpha_save_R -// 17 alpha_save_I -// 18 must save +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I // 19 must save // 20 must save // 21 must save @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) - eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.d[0] -#else - fmul v19.2d, v2.2d, v9.d[0] -#endif - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.2d, v3.2d, v8.d[1] + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.d[0] +#else + fmul v19.2d, v2.2d, v9.d[0] +#endif + OP_ir v19.2d, v3.2d, v8.d[0] + + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v25.2d, v1.2d, v10.d[0] + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 + fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v27.2d, v3.2d, v10.d[0] + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v29.2d, v1.2d, v10.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v31.2d, v3.2d, v10.d[1] - ld2 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld2 {v14.2d, v15.2d}, [pB] - add pB, pB, #32 - ld2 {v4.2d, v5.2d} , [pA] - add pA, pA, #32 - ld2 {v6.2d, v7.2d} , [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - ld2 {v12.2d, v13.2d}, [pB] // For next round + ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] - ld2 {v14.2d, v15.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] - ld2 {v4.2d, v5.2d} , [pA] // For next round + ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] - ld2 {v6.2d, v7.2d} , [pA] // For next round - add pA, pA, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] - ld2 {v8.2d, v9.2d}, [pB] // For next round + ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] - ld2 {v10.2d, v11.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] - ld2 {v0.2d, v1.2d}, [pA] // For next round + ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] - ld2 {v2.2d, v3.2d}, [pA] // For next round - add pA, pA, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] - OP_ri v19.2d, v2.2d, v9.d[0] - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - mov pCRow1, pCRow0 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2 {v0.2d, v1.2d}, [pCRow1] + ld2 {v0.2d, v1.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2 {v2.2d, v3.2d}, [pCRow0] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R - st2 {v2.2d, v3.2d}, [pCRow2] + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R + st2 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow1, pCRow1, #32 + + ld2 {v6.2d, v7.2d}, [pCRow1] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmla v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmla v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow1] - add pCRow1, pCRow1, LDC - ld2 {v0.2d, v1.2d}, [pCRow1] + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ld2 {v0.2d, v1.2d}, [pCRow2] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmla v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmla v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v26.2d, alphaV0_R fmls v2.2d, v27.2d, alphaV0_I - fmla v3.2d, v26.2d, alphaV1_I - fmla v3.2d, v27.2d, alphaV1_R + fmla v3.2d, v26.2d, alphaV0_I + fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm 
PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld2 {v4.2d, v5.2d}, [pCRow1] + ld2 {v4.2d, v5.2d}, [pCRow3] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmla v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R - st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R + st2 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + + ld2 {v6.2d, v7.2d}, [pCRow3] fmla v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I - fmla v7.2d, v30.2d, alphaV1_I - fmla v7.2d, v31.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmla v7.2d, v30.2d, alphaV0_I + fmla v7.2d, v31.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmla v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R + fmla v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmla v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R + fmla v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmla d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmla d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d24, alphaV0_R fmls d0, d25, alphaV0_I - fmla d1, d24, alphaV1_I - fmla d1, d25, alphaV1_R + fmla d1, d24, alphaV0_I + fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d28, alphaV0_R fmls d4, d29, alphaV0_I - fmla d5, d28, alphaV1_I - fmla d5, d29, alphaV1_R + fmla d5, d28, alphaV0_I + fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v6.2d, v7.2d}, [pCRow2] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmla v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R + fmla v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmla d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmla d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha_save_R, d0 - fmov alpha_save_I, d1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ble zgemm_kernel_L2_BEGIN zgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + mov pA, origPA // pA = start of A array zgemm_kernel_L4_M4_BEGIN: @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: cmp counterI, #0 ble zgemm_kernel_L4_M2_BEGIN + .align 5 zgemm_kernel_L4_M4_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , origK, #3 + cmp counterL , #2 blt zgemm_kernel_L4_M4_32 - KERNEL4x4_I // do one in the K - KERNEL4x4_M2 // do another in the K + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #2 // subtract 2 ble zgemm_kernel_L4_M4_22a - .align 5 + .align 5 zgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt zgemm_kernel_L4_M4_22 - + .align 5 zgemm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 + .align 5 zgemm_kernel_L4_M4_32: tst counterL, #1 ble zgemm_kernel_L4_M4_40 KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: zgemm_kernel_L4_M4_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble zgemm_kernel_L4_M4_100 + .align 5 zgemm_kernel_L4_M4_46: KERNEL4x4_SUB + subs counterL, counterL, #1 + bne zgemm_kernel_L4_M4_46 + zgemm_kernel_L4_M4_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE4x4 diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 7945870d6..77a7857ff 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define alpha_save_R x16 -#define alpha_save_I x17 -#define temp x18 -#define tempOffset x19 -#define tempK x20 +#define pCRow3 x15 +#define pA x16 +#define alphaR x17 +#define alphaI x18 +#define temp x19 +#define tempOffset x20 +#define tempK x21 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] -#define alpha1_R d14 -#define alphaV1_R v14.d[0] -#define alpha1_I d15 -#define alphaV1_I v15.d[0] - +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 04 origPB // 05 pC // 06 origLDC -> LDC -// 07 offset +// 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 alpha_save_R -// 17 alpha_save_I -// 18 must save temp -// 19 must save tempOffset -// 20 must save tempK -// 21 must save +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save temp +// 20 must save tempOffset +// 21 must save tempK // 22 must save // 23 must save // 24 must save @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) - eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.d[0] -#else - fmul v19.2d, v2.2d, v9.d[0] -#endif - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.2d, v3.2d, v8.d[1] + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.d[0] +#else + fmul v19.2d, v2.2d, v9.d[0] +#endif + OP_ir v19.2d, v3.2d, v8.d[0] + + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v25.2d, v1.2d, v10.d[0] + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 + fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v27.2d, v3.2d, v10.d[0] + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v29.2d, v1.2d, v10.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v31.2d, v3.2d, v10.d[1] - ld2 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld2 {v14.2d, v15.2d}, [pB] - add pB, pB, #32 - ld2 {v4.2d, v5.2d} , [pA] - add pA, pA, #32 - ld2 {v6.2d, v7.2d} , [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - ld2 {v12.2d, v13.2d}, [pB] // For next round + ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] - ld2 {v14.2d, v15.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] - ld2 {v4.2d, v5.2d} , [pA] // For next round + ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] - ld2 {v6.2d, v7.2d} , [pA] // For next round - add pA, pA, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] - ld2 {v8.2d, v9.2d}, [pB] // For next round + ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] - ld2 {v10.2d, v11.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] - ld2 {v0.2d, v1.2d}, [pA] // For next round + ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] - ld2 {v2.2d, v3.2d}, [pA] // For next round - add pA, pA, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] - OP_ri v19.2d, v2.2d, v9.d[0] - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - mov pCRow1, pCRow0 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R - st2 {v2.2d, v3.2d}, [pCRow2] + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R + st2 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + + add pCRow1, pCRow1, #32 + fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmul v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmul v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmul v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + fmul v2.2d, v26.2d, 
alphaV0_R fmls v2.2d, v27.2d, alphaV0_I - fmul v3.2d, v26.2d, alphaV1_I - fmla v3.2d, v27.2d, alphaV1_R + fmul v3.2d, v26.2d, alphaV0_I + fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmul v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R - st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R + st2 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + fmul v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I - fmul v7.2d, v30.2d, alphaV1_I - fmla v7.2d, v31.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmul v7.2d, v30.2d, alphaV0_I + fmla v7.2d, v31.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmul v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R + fmul v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmul v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R + fmul v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmul d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmul d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d0, d24, alphaV0_R fmls d0, d25, alphaV0_I - fmul d1, d24, alphaV1_I - fmla d1, d25, alphaV1_R + fmul d1, d24, alphaV0_I + fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d28, alphaV0_R fmls d4, d29, alphaV0_I - fmul d5, d28, alphaV1_I - fmla d5, d29, alphaV1_R + fmul d5, d28, alphaV0_I + fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmul v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R + fmul v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmul d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmul d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha_save_R, d0 - fmov alpha_save_I, d1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble ztrmm_kernel_L2_BEGIN ztrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: cmp counterI, #0 ble ztrmm_kernel_L4_M2_BEGIN + .align 5 ztrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt ztrmm_kernel_L4_M4_32 - KERNEL4x4_I // do one in the K - KERNEL4x4_M2 // do another in the K + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #2 ble ztrmm_kernel_L4_M4_22a - .align 5 + .align 5 ztrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M4_22 - + .align 5 ztrmm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 + .align 5 ztrmm_kernel_L4_M4_32: tst counterL, #1 ble ztrmm_kernel_L4_M4_40 KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: ztrmm_kernel_L4_M4_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble ztrmm_kernel_L4_M4_100 + .align 5 ztrmm_kernel_L4_M4_46: KERNEL4x4_SUB + subs counterL, counterL, #1 + bne ztrmm_kernel_L4_M4_46 + ztrmm_kernel_L4_M4_100: SAVE4x4 @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + ztrmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne ztrmm_kernel_L4_M4_20 diff --git a/param.h b/param.h index fdc9d1104..7635cb8fc 100644 --- a/param.h +++ b/param.h @@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 4