diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S index 7a70264ca..7f2ddea07 100644 --- a/kernel/arm64/cgemm_kernel_4x4.S +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // for next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] ld2 {v4.4s, v5.4s} , [pA] // for next round add pA, pA, #32 - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] prfm PLDL1KEEP, [pA, #512] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v6.4s, v7.4s} , [ppA] // for next round add ppA, ppA, #32 - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] prfm PLDL1KEEP, [ppA, #512] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // for next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] ld2 {v0.4s, v1.4s}, [pA] // for next round add pA, pA, #32 - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] prfm PLDL1KEEP, [ppA, #512] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] ld2 {v2.4s, v3.4s}, [ppA] // for next round add ppA, ppA, #32 - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] prfm PLDL1KEEP, [pB, #512] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_SUB @@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro SAVE8x4 @@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S old mode 100755 new mode 100644 index 40b98cee2..d58cef52d --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 @@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 @@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm @@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm @@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.2s[0] - OP_ii v18.4s, v3.4s, v9.2s[0] - OP_ri v19.4s, v2.4s, v9.2s[0] - OP_ir v19.4s, v3.4s, v8.2s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.2s[1] - OP_ii v22.4s, v3.4s, v9.2s[1] - OP_ri v23.4s, v2.4s, v9.2s[1] - OP_ir v23.4s, v3.4s, v8.2s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 @@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v8.4s[1] - OP_ri v17.4s, v0.4s, v8.4s[1] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v8.s[1] + OP_ri v17.4s, v0.4s, v8.s[1] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v8.4s[1] - OP_ri v19.4s, v2.4s, v8.4s[1] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v8.s[1] + OP_ri v19.4s, v2.4s, v8.s[1] + OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S index be0e9bdef..3de27257a 100644 --- a/kernel/arm64/ctrmm_kernel_4x4.S +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S old mode 100755 new mode 100644 index 3131541d4..ce5cb0406 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 @@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 @@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm @@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm @@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.2s[0] - OP_ii v18.4s, v3.4s, v9.2s[0] - OP_ri v19.4s, v2.4s, v9.2s[0] - OP_ir v19.4s, v3.4s, v8.2s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.2s[1] - OP_ii v22.4s, v3.4s, v9.2s[1] - OP_ri v23.4s, v2.4s, v9.2s[1] - OP_ir v23.4s, v3.4s, v8.2s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 @@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v8.4s[1] - OP_ri v17.4s, v0.4s, v8.4s[1] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v8.s[1] + OP_ri v17.4s, v0.4s, v8.s[1] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v8.4s[1] - OP_ri v19.4s, v2.4s, v8.4s[1] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v8.s[1] + OP_ri v19.4s, v2.4s, v8.s[1] + OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S index e2ad11492..44b0f7ff2 100644 --- a/kernel/arm64/dgemm_kernel_4x4.S +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp q0, q1, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v11.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmul v20.2d, v0.2d, v9.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] + fmul v20.2d, v0.2d, v9.d[0] + fmul v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v31.2d, v3.2d, v11.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v31.2d, v3.2d, v11.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmul v22.2d, v2.2d, v9.2d[0] - fmul v27.2d, v3.2d, v10.2d[0] + fmul v22.2d, v2.2d, v9.d[0] + fmul v27.2d, v3.2d, v10.d[0] ldp d12, d13, [pB] add pB, pB, #16 - fmul v24.2d, v0.2d, v10.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + fmul v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] // for next round add pA, pA, #32 - fmul v26.2d, v2.2d, v10.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + fmul v23.2d, v3.2d, v9.d[0] ldp q6, q7, [ppA] // for next round add ppA, ppA, #32 - fmul v28.2d, v0.2d, v11.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v11.d[0] + fmul v17.2d, v1.2d, v8.d[0] ldp d14, d15, [pB] add pB, pB, #16 - fmul v30.2d, v2.2d, v11.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v30.2d, v2.2d, v11.d[0] + fmul v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v15.d[0] ldp d8, d9, [pB] add pB, pB, #16 - fmla v18.2d, v6.2d, v12.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v31.2d, v7.2d, v15.d[0] ldp d10, d11, [pB] add pB, pB, #16 - fmla v20.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] ldp q0, q1, [pA] add pA, pA, #32 - fmla v26.2d, v6.2d, v14.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v17.2d, v5.2d, v12.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v30.2d, v6.2d, v15.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v19.2d, v7.2d, v12.d[0] .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v11.d[0] ldp d12, d13, [pB] add pB, pB, #16 - fmla v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v31.2d, v3.2d, v11.d[0] ldp d14, d15, [pB] add pB, pB, #16 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v20.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v22.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] add pA, pA, #32 - fmla v26.2d, v2.2d, v10.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v23.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v11.d[0] + fmla v17.2d, v1.2d, v8.d[0] ldp q6, q7, [ppA] add ppA, ppA, #32 - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v27.2d, v7.2d, v14.d[0] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v31.2d, v7.2d, v15.d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v23.2d, v7.2d, v13.d[0] .endm .macro KERNEL8x4_SUB @@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp q0, q1, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v20.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v10.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v24.2d, v0.2d, v10.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v28.2d, v0.2d, v11.d[0] + fmla v17.2d, v1.2d, v8.d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v31.2d, v3.2d, v11.d[0] + fmla v22.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v10.d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v23.2d, v3.2d, v9.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x4 @@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S old mode 100755 new mode 100644 index 88e9a773d..b04dbb5d5 --- a/kernel/arm64/dgemm_kernel_4x8.S +++ b/kernel/arm64/dgemm_kernel_4x8.S @@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v0.2d, v8.2d[1] - fmul v19.2d, v1.2d, v8.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v0.2d, v8.d[1] + fmul v19.2d, v1.2d, v8.d[1] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v0.2d, v9.2d[1] - fmul v23.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v9.d[0] + fmul v22.2d, v0.2d, v9.d[1] + fmul v23.2d, v1.2d, v9.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] - fmul v26.2d, v0.2d, v10.2d[1] - fmul v27.2d, v1.2d, v10.2d[1] + fmul v24.2d, v0.2d, v10.d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v26.2d, v0.2d, v10.d[1] + fmul v27.2d, v1.2d, v10.d[1] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] - fmul v30.2d, v0.2d, v11.2d[1] - fmul v31.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v11.d[0] + fmul v29.2d, v1.2d, v11.d[0] + fmul v30.2d, v0.2d, v11.d[1] + fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 @@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 @@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB @@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 @@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 @@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S old mode 100755 new mode 100644 index a607fecc4..f3c3d5c35 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 +// 15 pCRow3 +// 16 pA // 17 // 18 must save // 19 must save @@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 -//v08 must save pB0_0, pB0_1 -//v09 must save pB0_2, pB0_3 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB1_0, pB1_1 -//v13 must save pB1_2, pB1_3 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 @@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - ldp d8, d9, [pB] - add pB, pB, #16 - ldp d10, d11, [pB] - add pB, pB, #16 + ldp q0, q1, [pA], #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] + ldp d8, d9, [pB], #16 - fmul v18.2d, v2.2d, v8.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v20.2d, v0.2d, v9.d[0] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] + ldp d10, d11, [pB], #16 - fmul v22.2d, v2.2d, v9.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v21.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] + ldp q2, q3, [pA], #32 - fmul v26.2d, v2.2d, v10.2d[0] - fmul v27.2d, v3.2d, v10.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + fmul v28.2d, v0.2d, v11.d[0] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + ldp q4, q5, [pA], #32 - fmul v30.2d, v2.2d, v11.2d[0] - fmul v31.2d, v3.2d, v11.2d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v29.2d, v1.2d, v11.d[0] - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 - ldp d12, d13, [pB] - add pB, pB, #16 - ldp d14, d15, [pB] - add pB, pB, #16 + ldp d12, d13, [pB], #16 + + fmul v18.2d, v2.2d, v8.d[0] + fmul v22.2d, v2.2d, v9.d[0] + + ldp d14, d15, [pB], #16 + + fmul v26.2d, v2.2d, v10.d[0] + fmul v30.2d, v2.2d, v11.d[0] + + ldp q6, q7, [pA], #32 + + fmul v19.2d, v3.2d, v8.d[0] + fmul v27.2d, v3.2d, v10.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v31.2d, v3.2d, v11.d[0] + fmul v23.2d, v3.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] - ld1 {v4.2d}, [pA], #16 + ldp q4, q5, [pA], #32 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] - ld1 {v5.2d}, [pA], #16 + ldp d12, d13, [pB], #16 - fmla v30.2d, v2.2d, v11.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v25.2d, v1.2d, v10.d[0] - ldp d12, d13, [pB] - add pB, pB, #16 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v29.2d, v1.2d, v11.d[0] - ldp d14, d15, [pB] - add pB, pB, #16 + ldp d14, d15, [pB], #16 - fmla v18.2d, v2.2d, v8.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] - ld1 {v6.2d}, [pA], #16 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] + fmla v23.2d, v3.2d, v9.d[0] - ld1 {v7.2d}, [pA], #16 + ldp q6, q7, [pA], #32 - fmla v22.2d, v2.2d, v9.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] - - prfm PLDL1KEEP, [pA, #224] - prfm PLDL1KEEP, [pA, #224+64] + fmla v27.2d, v3.2d, v10.d[0] + fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] - ld1 {v0.2d}, [pA], #16 + ldp q0, q1, [pA], #32 - fmla v20.2d, v4.2d, v13.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] - ld1 {v1.2d}, [pA], #16 + ldp d8, d9, [pB], #16 - fmla v30.2d, v6.2d, v15.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] - ldp d8, d9, [pB] - add pB, pB, #16 + ldp d10, d11, [pB], #16 - fmla v28.2d, v4.2d, v15.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] - ldp d10, d11, [pB] - add pB, pB, #16 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - ld1 {v2.2d}, [pA], #16 + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + ldp q2, q3, [pA], #32 - ld1 {v3.2d}, [pA], #16 - - fmla v18.2d, v6.2d, v12.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - - prfm PLDL1KEEP, [pB, #640] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] + + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - ldp d8, d9, [pB] - add pB, pB, #16 - ldp d10, d11, [pB] - add pB, pB, #16 + ldp q0, q1, [pA], #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + ldp d8, d9, [pB], #16 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + ldp d10, d11, [pB], #16 - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + fmla v25.2d, v1.2d, v10.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v19.2d, v3.2d, v8.d[0] + fmla v27.2d, v3.2d, v10.d[0] + + fmla v31.2d, v3.2d, v11.d[0] + fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 fmov alpha0, alpha - ld1 {v0.2d, v1.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 - st1 {v0.2d, v1.2d}, [pCRow0] + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld1 {v2.2d, v3.2d}, [pCRow0] + ldp q2, q3, [pCRow0] fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 - st1 {v2.2d, v3.2d}, [pCRow0] + stp q2, q3, [pCRow0] add pCRow0, pCRow0, #32 - ld1 {v4.2d, v5.2d}, [pCRow1] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 - st1 {v4.2d, v5.2d}, [pCRow1] + stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld1 {v6.2d, v7.2d}, [pCRow1] + ldp q6, q7, [pCRow1] fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 - st1 {v6.2d, v7.2d}, [pCRow1] + stp q6, q7, [pCRow1] add pCRow1, pCRow1, #32 - ld1 {v0.2d, v1.2d}, [pCRow2] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q0, q1, [pCRow2] fmla v0.2d, v24.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0 - st1 {v0.2d, v1.2d}, [pCRow2] + stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 - ld1 {v2.2d, v3.2d}, [pCRow2] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q2, q3, [pCRow2] fmla v2.2d, v26.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0 - st1 {v2.2d, v3.2d}, [pCRow2] + stp q2, q3, [pCRow2] add pCRow2, pCRow2, #32 - ld1 {v4.2d, v5.2d}, [pCRow3] + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + ldp q4, q5, [pCRow3] fmla v4.2d, v28.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0 - st1 {v4.2d, v5.2d}, [pCRow3] + stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld1 {v6.2d, v7.2d}, [pCRow3] + ldp q6, q7, [pCRow3] fmla v6.2d, v30.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0 - st1 {v6.2d, v7.2d}, [pCRow3] + stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 - - prfm PLDL2KEEP, [pCRow0, #128] - prfm PLDL2KEEP, [pCRow1, #128] - prfm PLDL2KEEP, [pCRow2, #128] - prfm PLDL2KEEP, [pCRow3, #128] .endm /******************************************************************************/ @@ -408,44 +419,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 - fmla v9.2d, v25.2d, alphaV1 + fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV2 - fmla v13.2d, v29.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 + fmla v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -467,13 +479,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow1, pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d}, [pCRow2] - fmla v8.2d, v24.2d, alphaV2 + fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] @@ -559,32 +573,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 + fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 + fmla v1.2d, v17.2d, alphaV0 + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0 - fmla v5.2d, v21.2d, alphaV1 - fmla v6.2d, v22.2d, alphaV2 - fmla v7.2d, v23.2d, alphaV3 + fmla v5.2d, v21.2d, alphaV0 + fmla v6.2d, v22.2d, alphaV0 + fmla v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 @@ -605,23 +620,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -641,11 +657,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow1 , pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -672,10 +689,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 + fmov alpha0, alpha add pCRow1 , pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -706,18 +724,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 + fmov alpha0, alpha ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 + fmla v1.2d, v17.2d, alphaV0 + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 @@ -738,14 +757,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 @@ -765,10 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha ldr d8, [pCRow0] fmadd d8, d16, alpha0, d8 str d8, [pCRow0] @@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN: add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble dgemm_kernel_L4_M4_BEGIN + .align 5 dgemm_kernel_L4_M8_20: mov pB, origPB @@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20: subs counterL, counterL, #2 // subtract 2 ble dgemm_kernel_L4_M8_22a - .align 5 + .align 5 dgemm_kernel_L4_M8_22: KERNEL8x4_M1 @@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22: subs counterL, counterL, #1 bgt dgemm_kernel_L4_M8_22 - + .align 5 dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 @@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a: b dgemm_kernel_L4_M8_44 + .align 5 dgemm_kernel_L4_M8_32: tst counterL, #1 @@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44: ands counterL , origK, #7 ble dgemm_kernel_L4_M8_100 + .align 5 dgemm_kernel_L4_M8_46: KERNEL8x4_SUB @@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46: bne dgemm_kernel_L4_M8_46 dgemm_kernel_L4_M8_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE8x4 diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S index 0d1b12881..34fb8c233 100644 --- a/kernel/arm64/dtrmm_kernel_4x4.S +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S old mode 100755 new mode 100644 index eb7397faa..4aecf28eb --- a/kernel/arm64/dtrmm_kernel_4x8.S +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v0.2d, v8.2d[1] - fmul v19.2d, v1.2d, v8.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v0.2d, v8.d[1] + fmul v19.2d, v1.2d, v8.d[1] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v0.2d, v9.2d[1] - fmul v23.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v9.d[0] + fmul v22.2d, v0.2d, v9.d[1] + fmul v23.2d, v1.2d, v9.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] - fmul v26.2d, v0.2d, v10.2d[1] - fmul v27.2d, v1.2d, v10.2d[1] + fmul v24.2d, v0.2d, v10.d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v26.2d, v0.2d, v10.d[1] + fmul v27.2d, v1.2d, v10.d[1] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] - fmul v30.2d, v0.2d, v11.2d[1] - fmul v31.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v11.d[0] + fmul v29.2d, v1.2d, v11.d[0] + fmul v30.2d, v0.2d, v11.d[1] + fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 @@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 @@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S old mode 100755 new mode 100644 index 6890505bd..b06c7560d --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v21.2d, v1.2d, v8.2d[1] - fmul v22.2d, v2.2d, v8.2d[1] - fmul v23.2d, v3.2d, v8.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + fmul v21.2d, v1.2d, v8.d[1] + fmul v22.2d, v2.2d, v8.d[1] + fmul v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v25.2d, v1.2d, v9.2d[0] - fmul v26.2d, v2.2d, v9.2d[0] - fmul v27.2d, v3.2d, v9.2d[0] + fmul v24.2d, v0.2d, v9.d[0] + fmul v25.2d, v1.2d, v9.d[0] + fmul v26.2d, v2.2d, v9.d[0] + fmul v27.2d, v3.2d, v9.d[0] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v29.2d, v1.2d, v9.2d[1] - fmul v30.2d, v2.2d, v9.2d[1] - fmul v31.2d, v3.2d, v9.2d[1] + fmul v28.2d, v0.2d, v9.d[1] + fmul v29.2d, v1.2d, v9.d[1] + fmul v30.2d, v2.2d, v9.d[1] + fmul v31.2d, v3.2d, v9.d[1] ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v9.2d[0] + fmla v24.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v9.d[0] + fmla v26.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] - fmla v31.2d, v3.2d, v9.2d[1] + fmla v28.2d, v0.2d, v9.d[1] + fmla v29.2d, v1.2d, v9.d[1] + fmla v30.2d, v2.2d, v9.d[1] + fmla v31.2d, v3.2d, v9.d[1] ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] + fmla v20.2d, v4.2d, v12.d[1] + fmla v21.2d, v5.2d, v12.d[1] + fmla v22.2d, v6.2d, v12.d[1] + fmla v23.2d, v7.2d, v12.d[1] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v13.d[0] + fmla v26.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v13.d[0] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v28.2d, v4.2d, v13.d[1] + fmla v29.2d, v5.2d, v13.d[1] + fmla v30.2d, v6.2d, v13.d[1] + fmla v31.2d, v7.2d, v13.d[1] ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] + fmla v20.2d, v4.2d, v12.d[1] + fmla v21.2d, v5.2d, v12.d[1] + fmla v22.2d, v6.2d, v12.d[1] + fmla v23.2d, v7.2d, v12.d[1] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v13.d[0] + fmla v26.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v13.d[0] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v28.2d, v4.2d, v13.d[1] + fmla v29.2d, v5.2d, v13.d[1] + fmla v30.2d, v6.2d, v13.d[1] + fmla v31.2d, v7.2d, v13.d[1] .endm .macro KERNEL8x4_SUB @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v9.2d[0] + fmla v24.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v9.d[0] + fmla v26.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] - fmla v31.2d, v3.2d, v9.2d[1] + fmla v28.2d, v0.2d, v9.d[1] + fmla v29.2d, v1.2d, v9.d[1] + fmla v30.2d, v2.2d, v9.d[1] + fmla v31.2d, v3.2d, v9.d[1] .endm .macro SAVE8x4 @@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 @@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 @@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 22b55b01c..68366d9f2 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v18.4s, v2.4s, v8.2s[0] - fmul v19.4s, v3.4s, v8.2s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v18.4s, v2.4s, v8.s[0] + fmul v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v22.4s, v2.4s, v8.2s[1] - fmul v23.4s, v3.4s, v8.2s[1] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v22.4s, v2.4s, v8.s[1] + fmul v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v26.4s, v2.4s, v9.2s[0] - fmul v27.4s, v3.4s, v9.2s[0] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v26.4s, v2.4s, v9.s[0] + fmul v27.4s, v3.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] - fmul v30.4s, v2.4s, v9.2s[1] - fmul v31.4s, v3.4s, v9.2s[1] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + fmul v30.4s, v2.4s, v9.s[1] + fmul v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] .endm .macro KERNEL16x4_SUB @@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] .endm .macro SAVE16x4 @@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 @@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 @@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S index bfa80d589..a5cf7baff 100644 --- a/kernel/arm64/sgemm_kernel_4x4.S +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 - fmul v16.4s, v0.4s, v8.4s[0] - fmul v20.4s, v0.4s, v8.4s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 - fmul v24.4s, v0.4s, v8.4s[2] - fmul v28.4s, v0.4s, v8.4s[3] + fmul v24.4s, v0.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 - fmul v17.4s, v2.4s, v8.4s[0] - fmul v21.4s, v2.4s, v8.4s[1] + fmul v17.4s, v2.4s, v8.s[0] + fmul v21.4s, v2.4s, v8.s[1] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 - fmul v25.4s, v2.4s, v8.4s[2] - fmul v29.4s, v2.4s, v8.4s[3] + fmul v25.4s, v2.4s, v8.s[2] + fmul v29.4s, v2.4s, v8.s[3] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 - fmul v18.4s, v4.4s, v8.4s[0] - fmul v19.4s, v6.4s, v8.4s[0] + fmul v18.4s, v4.4s, v8.s[0] + fmul v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmul v22.4s, v4.4s, v8.4s[1] - fmul v23.4s, v6.4s, v8.4s[1] + fmul v22.4s, v4.4s, v8.s[1] + fmul v23.4s, v6.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmul v26.4s, v4.4s, v8.4s[2] - fmul v27.4s, v6.4s, v8.4s[2] + fmul v26.4s, v4.4s, v8.s[2] + fmul v27.4s, v6.4s, v8.s[2] ld1 {v5.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmul v30.4s, v4.4s, v8.4s[3] - fmul v31.4s, v6.4s, v8.4s[3] + fmul v30.4s, v4.4s, v8.s[3] + fmul v31.4s, v6.4s, v8.s[3] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 .endm .macro KERNEL16x4_M2 - fmla v16.4s, v1.4s, v12.4s[0] - fmla v17.4s, v3.4s, v12.4s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v17.4s, v3.4s, v12.s[0] ld1 {v8.4s}, [pB] // for next round add pB, pB, #16 - fmla v18.4s, v5.4s, v12.4s[0] - fmla v19.4s, v7.4s, v12.4s[0] + fmla v18.4s, v5.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] ld1 {v0.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmla v20.4s, v1.4s, v12.4s[1] - fmla v21.4s, v3.4s, v12.4s[1] + fmla v20.4s, v1.4s, v12.s[1] + fmla v21.4s, v3.4s, v12.s[1] ld1 {v2.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmla v22.4s, v5.4s, v12.4s[1] - fmla v23.4s, v7.4s, v12.4s[1] + fmla v22.4s, v5.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] ld1 {v4.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmla v24.4s, v1.4s, v12.4s[2] - fmla v25.4s, v3.4s, v12.4s[2] + fmla v24.4s, v1.4s, v12.s[2] + fmla v25.4s, v3.4s, v12.s[2] ld1 {v6.4s}, [pA_3] // for next round add pA_3, pA_3, #16 - fmla v26.4s, v5.4s, v12.4s[2] - fmla v27.4s, v7.4s, v12.4s[2] + fmla v26.4s, v5.4s, v12.s[2] + fmla v27.4s, v7.4s, v12.s[2] prfm PLDL1KEEP, [pA_2, #512] - fmla v28.4s, v1.4s, v12.4s[3] - fmla v29.4s, v3.4s, v12.4s[3] + fmla v28.4s, v1.4s, v12.s[3] + fmla v29.4s, v3.4s, v12.s[3] prfm PLDL1KEEP, [pA_3, #512] - fmla v30.4s, v5.4s, v12.4s[3] - fmla v31.4s, v7.4s, v12.4s[3] + fmla v30.4s, v5.4s, v12.s[3] + fmla v31.4s, v7.4s, v12.s[3] prfm PLDL1KEEP, [pB, #512] .endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.4s[0] - fmla v17.4s, v2.4s, v8.4s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v2.4s, v8.s[0] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 - fmla v18.4s, v4.4s, v8.4s[0] - fmla v19.4s, v6.4s, v8.4s[0] + fmla v18.4s, v4.4s, v8.s[0] + fmla v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmla v20.4s, v0.4s, v8.4s[1] - fmla v21.4s, v2.4s, v8.4s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v2.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmla v22.4s, v4.4s, v8.4s[1] - fmla v23.4s, v6.4s, v8.4s[1] + fmla v22.4s, v4.4s, v8.s[1] + fmla v23.4s, v6.4s, v8.s[1] ld1 {v5.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmla v24.4s, v0.4s, v8.4s[2] - fmla v25.4s, v2.4s, v8.4s[2] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v2.4s, v8.s[2] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 - fmla v26.4s, v4.4s, v8.4s[2] - fmla v27.4s, v6.4s, v8.4s[2] + fmla v26.4s, v4.4s, v8.s[2] + fmla v27.4s, v6.4s, v8.s[2] prfm PLDL1KEEP, [pA_0, #512] - fmla v28.4s, v0.4s, v8.4s[3] - fmla v29.4s, v2.4s, v8.4s[3] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v2.4s, v8.s[3] prfm PLDL1KEEP, [pA_1, #512] - fmla v30.4s, v4.4s, v8.4s[3] - fmla v31.4s, v6.4s, v8.4s[3] + fmla v30.4s, v4.4s, v8.s[3] + fmla v31.4s, v6.4s, v8.s[3] .endm .macro KERNEL16x4_E - fmla v16.4s, v1.4s, v12.4s[0] - fmla v17.4s, v3.4s, v12.4s[0] - fmla v18.4s, v5.4s, v12.4s[0] - fmla v19.4s, v7.4s, v12.4s[0] - fmla v20.4s, v1.4s, v12.4s[1] - fmla v21.4s, v3.4s, v12.4s[1] - fmla v22.4s, v5.4s, v12.4s[1] - fmla v23.4s, v7.4s, v12.4s[1] - fmla v24.4s, v1.4s, v12.4s[2] - fmla v25.4s, v3.4s, v12.4s[2] - fmla v26.4s, v5.4s, v12.4s[2] - fmla v27.4s, v7.4s, v12.4s[2] - fmla v28.4s, v1.4s, v12.4s[3] - fmla v29.4s, v3.4s, v12.4s[3] - fmla v30.4s, v5.4s, v12.4s[3] - fmla v31.4s, v7.4s, v12.4s[3] + fmla v16.4s, v1.4s, v12.s[0] + fmla v17.4s, v3.4s, v12.s[0] + fmla v18.4s, v5.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] + fmla v20.4s, v1.4s, v12.s[1] + fmla v21.4s, v3.4s, v12.s[1] + fmla v22.4s, v5.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] + fmla v24.4s, v1.4s, v12.s[2] + fmla v25.4s, v3.4s, v12.s[2] + fmla v26.4s, v5.4s, v12.s[2] + fmla v27.4s, v7.4s, v12.s[2] + fmla v28.4s, v1.4s, v12.s[3] + fmla v29.4s, v3.4s, v12.s[3] + fmla v30.4s, v5.4s, v12.s[3] + fmla v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL16x4_SUB @@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.4s, v0.4s, v8.4s[0] - fmla v20.4s, v0.4s, v8.4s[1] - fmla v24.4s, v0.4s, v8.4s[2] - fmla v28.4s, v0.4s, v8.4s[3] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 - fmla v17.4s, v2.4s, v8.4s[0] - fmla v21.4s, v2.4s, v8.4s[1] - fmla v25.4s, v2.4s, v8.4s[2] - fmla v29.4s, v2.4s, v8.4s[3] + fmla v17.4s, v2.4s, v8.s[0] + fmla v21.4s, v2.4s, v8.s[1] + fmla v25.4s, v2.4s, v8.s[2] + fmla v29.4s, v2.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 - fmla v18.4s, v4.4s, v8.4s[0] - fmla v22.4s, v4.4s, v8.4s[1] - fmla v26.4s, v4.4s, v8.4s[2] - fmla v30.4s, v4.4s, v8.4s[3] + fmla v18.4s, v4.4s, v8.s[0] + fmla v22.4s, v4.4s, v8.s[1] + fmla v26.4s, v4.4s, v8.s[2] + fmla v30.4s, v4.4s, v8.s[3] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 - fmla v19.4s, v6.4s, v8.4s[0] - fmla v23.4s, v6.4s, v8.4s[1] - fmla v27.4s, v6.4s, v8.4s[2] - fmla v31.4s, v6.4s, v8.4s[3] + fmla v19.4s, v6.4s, v8.s[0] + fmla v23.4s, v6.4s, v8.s[1] + fmla v27.4s, v6.4s, v8.s[2] + fmla v31.4s, v6.4s, v8.s[3] .endm .macro SAVE16x4 @@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v2.2s, v3.2s}, [pA_1] add pA_1, pA_1, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] - fmla v18.2s, v2.2s, v8.2s[0] - fmla v31.2s, v3.2s, v9.2s[1] - fmla v22.2s, v2.2s, v8.2s[1] - fmla v27.2s, v3.2s, v9.2s[0] + fmla v18.2s, v2.2s, v8.s[0] + fmla v31.2s, v3.2s, v9.s[1] + fmla v22.2s, v2.2s, v8.s[1] + fmla v27.2s, v3.2s, v9.s[0] - fmla v26.2s, v2.2s, v9.2s[0] - fmla v23.2s, v3.2s, v8.2s[1] - fmla v30.2s, v2.2s, v9.2s[1] - fmla v19.2s, v3.2s, v8.2s[0] + fmla v26.2s, v2.2s, v9.s[0] + fmla v23.2s, v3.2s, v8.s[1] + fmla v30.2s, v2.2s, v9.s[1] + fmla v19.2s, v3.2s, v8.s[0] .endm .macro SAVE8x4 @@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA_0] add pA_0, pA_0, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0 , pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0 , pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S index ac690e4d4..bd47bed31 100644 --- a/kernel/arm64/sgemm_kernel_8x8.S +++ b/kernel/arm64/sgemm_kernel_8x8.S @@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v17.4s, v1.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v19.4s, v1.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v21.4s, v1.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v23.4s, v1.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v25.4s, v1.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v27.4s, v1.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v29.4s, v1.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] - fmul v31.4s, v1.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v17.4s, v1.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB @@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 @@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB @@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 @@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v4.4s[0] - fmla v18.2s, v0.2s, v4.4s[1] - fmla v20.2s, v0.2s, v4.4s[2] - fmla v22.2s, v0.2s, v4.4s[3] - fmla v24.2s, v0.2s, v5.4s[0] - fmla v26.2s, v0.2s, v5.4s[1] - fmla v28.2s, v0.2s, v5.4s[2] - fmla v30.2s, v0.2s, v5.4s[3] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 @@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pA] add pA, pA, #4 - fmla s16, s0, v4.4s[0] - fmla s18, s0, v4.4s[1] - fmla s20, s0, v4.4s[2] - fmla s22, s0, v4.4s[3] - fmla s24, s0, v5.4s[0] - fmla s26, s0, v5.4s[1] - fmla s28, s0, v5.4s[2] - fmla s30, s0, v5.4s[3] + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 @@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S old mode 100755 new mode 100644 index b99760a03..28b321651 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v18.4s, v2.4s, v8.2s[0] - fmul v19.4s, v3.4s, v8.2s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v18.4s, v2.4s, v8.s[0] + fmul v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v22.4s, v2.4s, v8.2s[1] - fmul v23.4s, v3.4s, v8.2s[1] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v22.4s, v2.4s, v8.s[1] + fmul v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v26.4s, v2.4s, v9.2s[0] - fmul v27.4s, v3.4s, v9.2s[0] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v26.4s, v2.4s, v9.s[0] + fmul v27.4s, v3.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] - fmul v30.4s, v2.4s, v9.2s[1] - fmul v31.4s, v3.4s, v9.2s[1] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + fmul v30.4s, v2.4s, v9.s[1] + fmul v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] .endm .macro KERNEL16x4_SUB @@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] .endm .macro SAVE16x4 @@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 @@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 @@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S index 674e200d8..eeb3e6e72 100644 --- a/kernel/arm64/strmm_kernel_4x4.S +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S old mode 100755 new mode 100644 index 98b912934..843f0c890 --- a/kernel/arm64/strmm_kernel_8x8.S +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v17.4s, v1.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v19.4s, v1.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v21.4s, v1.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v23.4s, v1.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v25.4s, v1.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v27.4s, v1.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v29.4s, v1.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] - fmul v31.4s, v1.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v17.4s, v1.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB @@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 @@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 @@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v4.4s[0] - fmla v18.2s, v0.2s, v4.4s[1] - fmla v20.2s, v0.2s, v4.4s[2] - fmla v22.2s, v0.2s, v4.4s[3] - fmla v24.2s, v0.2s, v5.4s[0] - fmla v26.2s, v0.2s, v5.4s[1] - fmla v28.2s, v0.2s, v5.4s[2] - fmla v30.2s, v0.2s, v5.4s[3] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 @@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pA] add pA, pA, #4 - fmla s16, s0, v4.4s[0] - fmla s18, s0, v4.4s[1] - fmla s20, s0, v4.4s[2] - fmla s22, s0, v4.4s[3] - fmla s24, s0, v5.4s[0] - fmla s26, s0, v5.4s[1] - fmla s28, s0, v5.4s[2] - fmla s30, s0, v5.4s[3] + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 @@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 28ce3de40..1cb695e56 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.2d, v0.2d, v9.2d[0] + fmls v17.2d, v0.2d, v9.d[0] #else - fmul v17.2d, v0.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.d[0] #endif - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.2d[0] + fmls v19.2d, v2.2d, v9.d[0] #else - fmul v19.2d, v2.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.d[0] #endif - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.2d, v0.2d, v9.2d[1] + fmls v21.2d, v0.2d, v9.d[1] #else - fmul v21.2d, v0.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.d[1] #endif - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.2d, v2.2d, v9.2d[1] + fmls v23.2d, v2.2d, v9.d[1] #else - fmul v23.2d, v2.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.d[1] #endif - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.2d, v0.2d, v11.2d[0] + fmls v25.2d, v0.2d, v11.d[0] #else - fmul v25.2d, v0.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.d[0] #endif - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - fmul v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.2d, v2.2d, v11.2d[0] + fmls v27.2d, v2.2d, v11.d[0] #else - fmul v27.2d, v2.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.d[0] #endif - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - fmul v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.2d, v0.2d, v11.2d[1] + fmls v29.2d, v0.2d, v11.d[1] #else - fmul v29.2d, v0.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.d[1] #endif - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - fmul v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.2d, v2.2d, v11.2d[1] + fmls v31.2d, v2.2d, v11.d[1] #else - fmul v31.2d, v2.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.d[1] #endif - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_ir v31.2d, v3.2d, v10.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v14.2d, v15.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v4.2d, v5.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v10.2d, v11.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB @@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 @@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 @@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] - OP_rr d24, d0, v10.2d[0] - OP_ii d24, d1, v11.2d[0] - OP_ri d25, d0, v11.2d[0] - OP_ir d25, d1, v10.2d[0] + OP_rr d24, d0, v10.d[0] + OP_ii d24, d1, v11.d[0] + OP_ri d25, d0, v11.d[0] + OP_ir d25, d1, v10.d[0] - OP_rr d28, d0, v10.2d[1] - OP_ii d28, d1, v11.2d[1] - OP_ri d29, d0, v11.2d[1] - OP_ir d29, d1, v10.2d[1] + OP_rr d28, d0, v10.d[1] + OP_ii d28, d1, v11.d[1] + OP_ri d29, d0, v11.d[1] + OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 @@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 @@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 @@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 3ff8227e3..7945870d6 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.2d, v0.2d, v9.2d[0] + fmls v17.2d, v0.2d, v9.d[0] #else - fmul v17.2d, v0.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.d[0] #endif - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.2d[0] + fmls v19.2d, v2.2d, v9.d[0] #else - fmul v19.2d, v2.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.d[0] #endif - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.2d, v0.2d, v9.2d[1] + fmls v21.2d, v0.2d, v9.d[1] #else - fmul v21.2d, v0.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.d[1] #endif - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.2d, v2.2d, v9.2d[1] + fmls v23.2d, v2.2d, v9.d[1] #else - fmul v23.2d, v2.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.d[1] #endif - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.2d, v0.2d, v11.2d[0] + fmls v25.2d, v0.2d, v11.d[0] #else - fmul v25.2d, v0.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.d[0] #endif - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - fmul v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.2d, v2.2d, v11.2d[0] + fmls v27.2d, v2.2d, v11.d[0] #else - fmul v27.2d, v2.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.d[0] #endif - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - fmul v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.2d, v0.2d, v11.2d[1] + fmls v29.2d, v0.2d, v11.d[1] #else - fmul v29.2d, v0.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.d[1] #endif - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - fmul v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.2d, v2.2d, v11.2d[1] + fmls v31.2d, v2.2d, v11.d[1] #else - fmul v31.2d, v2.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.d[1] #endif - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_ir v31.2d, v3.2d, v10.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v14.2d, v15.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v4.2d, v5.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v10.2d, v11.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB @@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 @@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 @@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] - OP_rr d24, d0, v10.2d[0] - OP_ii d24, d1, v11.2d[0] - OP_ri d25, d0, v11.2d[0] - OP_ir d25, d1, v10.2d[0] + OP_rr d24, d0, v10.d[0] + OP_ii d24, d1, v11.d[0] + OP_ri d25, d0, v11.d[0] + OP_ir d25, d1, v10.d[0] - OP_rr d28, d0, v10.2d[1] - OP_ii d28, d1, v11.2d[1] - OP_ri d29, d0, v11.2d[1] - OP_ir d29, d1, v10.2d[1] + OP_rr d28, d0, v10.d[1] + OP_ii d28, d1, v11.d[1] + OP_ri d29, d0, v11.d[1] + OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 @@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 @@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 @@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2