Merge pull request #819 from ashwinyes/develop_20160324_fixes_optimizations

Cortex-A57: Fixes and Optimizations
This commit is contained in:
Zhang Xianyi 2016-03-27 00:04:20 -04:00
commit 7b4b7179ba
18 changed files with 3345 additions and 3313 deletions

View File

@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [ppA] ld2 {v2.4s, v3.4s}, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
fmul v18.4s, v2.4s, v8.4s[0] fmul v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.4s, v2.4s, v9.4s[0] fmls v19.4s, v2.4s, v9.s[0]
#else #else
fmul v19.4s, v2.4s, v9.4s[0] fmul v19.4s, v2.4s, v9.s[0]
#endif #endif
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
fmul v22.4s, v2.4s, v8.4s[1] fmul v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.4s, v2.4s, v9.4s[1] fmls v23.4s, v2.4s, v9.s[1]
#else #else
fmul v23.4s, v2.4s, v9.4s[1] fmul v23.4s, v2.4s, v9.s[1]
#endif #endif
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
fmul v26.4s, v2.4s, v8.4s[2] fmul v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.4s, v2.4s, v9.4s[2] fmls v27.4s, v2.4s, v9.s[2]
#else #else
fmul v27.4s, v2.4s, v9.4s[2] fmul v27.4s, v2.4s, v9.s[2]
#endif #endif
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
fmul v30.4s, v2.4s, v8.4s[3] fmul v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.4s, v2.4s, v9.4s[3] fmls v31.4s, v2.4s, v9.s[3]
#else #else
fmul v31.4s, v2.4s, v9.4s[3] fmul v31.4s, v2.4s, v9.s[3]
#endif #endif
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL8x4_M1: one accumulation step into v16-v31 using the operands held
// in v0-v3 and lanes 0-3 of v8/v9, interleaved with the loads that fill
// v4-v7 and v12/v13 for the following step.
//
// This text is a side-by-side diff rendering: every line carries the old
// form followed by the new form. The two columns differ only in the
// by-element operand spelling (vN.4s[i] on the left, vN.s[i] on the right).
//
// OP_rr/OP_ii/OP_ri/OP_ir are macros defined outside this view.
// NOTE(review): presumably fmla/fmls variants selected by the conjugation
// mode, with v0/v2 holding real parts and v1/v3 imaginary parts - confirm.
// pA, ppA and pB are register aliases defined outside this view.
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
// v0/v1 with lane 0 of v8/v9: rr and ii terms go to v16, ri and ir to v17.
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
// ld2 de-interleaves 8 consecutive floats at pB: even elements to v12,
// odd elements to v13. pB then advances by the 32 bytes just read.
ld2 {v12.4s, v13.4s}, [pB] // for next round ld2 {v12.4s, v13.4s}, [pB] // for next round
add pB, pB, #32 add pB, pB, #32
// v0/v1 with lane 1 of v8/v9 -> v20/v21.
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
// Prefetch-for-load hint into L1, 512 bytes past the updated pB.
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// v0/v1 with lane 2 of v8/v9 -> v24/v25.
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
// De-interleaving load of 8 floats at pA into v4/v5; pA advances 32 bytes.
ld2 {v4.4s, v5.4s} , [pA] // for next round ld2 {v4.4s, v5.4s} , [pA] // for next round
add pA, pA, #32 add pA, pA, #32
// v0/v1 with lane 3 of v8/v9 -> v28/v29.
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
// Prefetch hint 512 bytes past the updated pA.
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
// Second half: v2/v3 with lane 0 of v8/v9 -> v18/v19.
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
// De-interleaving load of 8 floats at ppA into v6/v7; ppA advances 32 bytes.
ld2 {v6.4s, v7.4s} , [ppA] // for next round ld2 {v6.4s, v7.4s} , [ppA] // for next round
add ppA, ppA, #32 add ppA, ppA, #32
// v2/v3 with lane 1 of v8/v9 -> v22/v23.
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
// Prefetch hint 512 bytes past the updated ppA.
prfm PLDL1KEEP, [ppA, #512] prfm PLDL1KEEP, [ppA, #512]
// v2/v3 with lane 2 of v8/v9 -> v26/v27.
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
// v2/v3 with lane 3 of v8/v9 -> v30/v31.
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
.endm .endm
// KERNEL8x4_M2: counterpart of KERNEL8x4_M1 working on the other register
// set. It accumulates into v16-v31 from v4-v7 and lanes 0-3 of v12/v13
// (the registers KERNEL8x4_M1 loads), and reloads v0-v3 and v8/v9 (the
// registers KERNEL8x4_M1 consumes).
//
// This text is a side-by-side diff rendering: every line carries the old
// form followed by the new form. The two columns differ only in the
// by-element operand spelling (vN.4s[i] on the left, vN.s[i] on the right).
//
// OP_rr/OP_ii/OP_ri/OP_ir, pA, ppA and pB are defined outside this view.
// NOTE(review): presumably fmla/fmls variants selected by the conjugation
// mode - confirm against the macro definitions.
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
// v4/v5 with lane 0 of v12/v13: rr and ii terms go to v16, ri and ir to v17.
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
// ld2 de-interleaves 8 consecutive floats at pB: even elements to v8,
// odd elements to v9. pB then advances by the 32 bytes just read.
ld2 {v8.4s, v9.4s}, [pB] // for next round ld2 {v8.4s, v9.4s}, [pB] // for next round
add pB, pB, #32 add pB, pB, #32
// v4/v5 with lane 1 of v12/v13 -> v20/v21.
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
// Prefetch-for-load hint into L1, 512 bytes past pA (issued before the
// pA load and increment below).
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
// v4/v5 with lane 2 of v12/v13 -> v24/v25.
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
// De-interleaving load of 8 floats at pA into v0/v1; pA advances 32 bytes.
ld2 {v0.4s, v1.4s}, [pA] // for next round ld2 {v0.4s, v1.4s}, [pA] // for next round
add pA, pA, #32 add pA, pA, #32
// v4/v5 with lane 3 of v12/v13 -> v28/v29.
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
// Prefetch hint 512 bytes past ppA (issued before the ppA load below).
prfm PLDL1KEEP, [ppA, #512] prfm PLDL1KEEP, [ppA, #512]
// Second half: v6/v7 with lane 0 of v12/v13 -> v18/v19.
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
// De-interleaving load of 8 floats at ppA into v2/v3; ppA advances 32 bytes.
ld2 {v2.4s, v3.4s}, [ppA] // for next round ld2 {v2.4s, v3.4s}, [ppA] // for next round
add ppA, ppA, #32 add ppA, ppA, #32
// v6/v7 with lane 1 of v12/v13 -> v22/v23.
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
// Prefetch hint 512 bytes past the already-advanced pB.
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// v6/v7 with lane 2 of v12/v13 -> v26/v27.
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
// v6/v7 with lane 3 of v12/v13 -> v30/v31.
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
.endm .endm
// KERNEL8x4_E: performs the same 32 accumulations as KERNEL8x4_M2 (v4-v7
// with lanes 0-3 of v12/v13 into v16-v31) but issues no loads, prefetches
// or pointer updates.
// NOTE(review): presumably the final step of the software pipeline, where
// no further operands need to be fetched - confirm against the call sites.
//
// This text is a side-by-side diff rendering: every line carries the old
// form followed by the new form. The two columns differ only in the
// by-element operand spelling (vN.4s[i] on the left, vN.s[i] on the right).
//
// OP_rr/OP_ii/OP_ri/OP_ir are macros defined outside this view.
.macro KERNEL8x4_E .macro KERNEL8x4_E
// v4/v5 with lane 0 of v12/v13: rr and ii terms go to v16, ri and ir to v17.
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
// v4/v5 with lane 1 -> v20/v21.
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
// v4/v5 with lane 2 -> v24/v25.
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
// v4/v5 with lane 3 -> v28/v29.
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
// Second half: v6/v7 with lane 0 -> v18/v19.
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
// v6/v7 with lane 1 -> v22/v23.
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
// v6/v7 with lane 2 -> v26/v27.
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
// v6/v7 with lane 3 -> v30/v31.
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v2.4s, v3.4s}, [ppA] ld2 {v2.4s, v3.4s}, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.4s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.4s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.4s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.4s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.4s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.4s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.4s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.4s[1] OP_ir v21.2s, v1.2s, v8.s[1]
OP_rr v24.2s, v0.2s, v8.4s[2] OP_rr v24.2s, v0.2s, v8.s[2]
OP_ii v24.2s, v1.2s, v9.4s[2] OP_ii v24.2s, v1.2s, v9.s[2]
OP_ri v25.2s, v0.2s, v9.4s[2] OP_ri v25.2s, v0.2s, v9.s[2]
OP_ir v25.2s, v1.2s, v8.4s[2] OP_ir v25.2s, v1.2s, v8.s[2]
OP_rr v28.2s, v0.2s, v8.4s[3] OP_rr v28.2s, v0.2s, v8.s[3]
OP_ii v28.2s, v1.2s, v9.4s[3] OP_ii v28.2s, v1.2s, v9.s[3]
OP_ri v29.2s, v0.2s, v9.4s[3] OP_ri v29.2s, v0.2s, v9.s[3]
OP_ir v29.2s, v1.2s, v8.4s[3] OP_ir v29.2s, v1.2s, v8.s[3]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.4s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.4s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.4s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.4s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.4s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.4s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.4s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.4s[1] OP_ir s21, s1, v8.s[1]
OP_rr s24, s0, v8.4s[2] OP_rr s24, s0, v8.s[2]
OP_ii s24, s1, v9.4s[2] OP_ii s24, s1, v9.s[2]
OP_ri s25, s0, v9.4s[2] OP_ri s25, s0, v9.s[2]
OP_ir s25, s1, v8.4s[2] OP_ir s25, s1, v8.s[2]
OP_rr s28, s0, v8.4s[3] OP_rr s28, s0, v8.s[3]
OP_ii s28, s1, v9.4s[3] OP_ii s28, s1, v9.s[3]
OP_ri s29, s0, v9.4s[3] OP_ri s29, s0, v9.s[3]
OP_ir s29, s1, v8.4s[3] OP_ir s29, s1, v8.s[3]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.2s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.2s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.2s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.2s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.2s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.2s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.2s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.2s[1] OP_ir v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.2s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.2s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.2s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.2s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.2s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.2s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.2s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.2s[1] OP_ir s21, s1, v8.s[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2

664
kernel/arm64/cgemm_kernel_8x4.S Executable file → Normal file
View File

@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v18.4s, v2.4s, v8.4s[0] fmul v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.4s, v2.4s, v9.4s[0] fmls v19.4s, v2.4s, v9.s[0]
#else #else
fmul v19.4s, v2.4s, v9.4s[0] fmul v19.4s, v2.4s, v9.s[0]
#endif #endif
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v22.4s, v2.4s, v8.4s[1] fmul v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.4s, v2.4s, v9.4s[1] fmls v23.4s, v2.4s, v9.s[1]
#else #else
fmul v23.4s, v2.4s, v9.4s[1] fmul v23.4s, v2.4s, v9.s[1]
#endif #endif
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v26.4s, v2.4s, v8.4s[2] fmul v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.4s, v2.4s, v9.4s[2] fmls v27.4s, v2.4s, v9.s[2]
#else #else
fmul v27.4s, v2.4s, v9.4s[2] fmul v27.4s, v2.4s, v9.s[2]
#endif #endif
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
fmul v30.4s, v2.4s, v8.4s[3] fmul v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.4s, v2.4s, v9.4s[3] fmls v31.4s, v2.4s, v9.s[3]
#else #else
fmul v31.4s, v2.4s, v9.4s[3] fmul v31.4s, v2.4s, v9.s[3]
#endif #endif
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
ld2 {v8.4s, v9.4s}, [pB] ld2 {v8.4s, v9.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL8x4_E (cgemm_kernel_8x4.S): accumulates into v16-v31 from v4-v7
// and lanes 0-3 of v12/v13. It issues no loads, prefetches or pointer
// updates. For each lane the v4/v5 products are followed directly by the
// v6/v7 products, so accumulators are visited in ascending register order.
// NOTE(review): presumably the final step of the software pipeline, where
// no further operands need to be fetched - confirm against the call sites.
//
// This text is a side-by-side diff rendering: every line carries the old
// form followed by the new form. The two columns differ only in the
// by-element operand spelling (vN.4s[i] on the left, vN.s[i] on the right).
//
// OP_rr/OP_ii/OP_ri/OP_ir are macros defined outside this view.
.macro KERNEL8x4_E .macro KERNEL8x4_E
// Lane 0 of v12/v13: v4/v5 -> v16/v17, then v6/v7 -> v18/v19.
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
// Lane 1 of v12/v13: v4/v5 -> v20/v21, then v6/v7 -> v22/v23.
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
// Lane 2 of v12/v13: v4/v5 -> v24/v25, then v6/v7 -> v26/v27.
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
// Lane 3 of v12/v13: v4/v5 -> v28/v29, then v6/v7 -> v30/v31.
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
.endm .endm
@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
.endm .endm
@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL4x4_M1: one accumulation step of the 4x4 kernel that also fetches the
// operands for the following step.
//   Reads:   v0/v1 and lanes 0..3 of v8/v9 (must already be loaded).
//   Updates: accumulator pairs v16/v17, v20/v21, v24/v25, v28/v29 - one pair
//            per lane of v8/v9.
//   Loads:   v12/v13 from [pB] and v4/v5 from [pA], advancing each pointer by
//            32 bytes. These are the registers KERNEL4x4_M2 and KERNEL4x4_E
//            consume.
//   Prefetches [pA, #512] with PLDL1KEEP.
// ld2 de-interleaves: the first register of each pair receives the
// even-indexed 32-bit elements and the second the odd-indexed ones
// (presumably real and imaginary parts of interleaved complex data - confirm).
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this macro; presumably
// multiply-accumulate variants chosen per conjugation mode - confirm against
// their definitions.
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
// Lane 0 of v8/v9 -> v16/v17
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
// Loads are placed between the arithmetic groups; the registers they write
// (v12/v13, v4/v5) are not read anywhere in this macro.
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// Lane 1 of v8/v9 -> v20/v21
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// Lane 2 of v8/v9 -> v24/v25
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
// Lane 3 of v8/v9 -> v28/v29
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
// KERNEL4x4_M2: counterpart of KERNEL4x4_M1 working on the other register
// set.
//   Reads:   v4/v5 and lanes 0..3 of v12/v13 (the registers KERNEL4x4_M1
//            loads).
//   Updates: accumulator pairs v16/v17, v20/v21, v24/v25, v28/v29.
//   Loads:   v8/v9 from [pB] and v0/v1 from [pA], advancing each pointer by
//            32 bytes. These are the registers KERNEL4x4_M1 consumes.
//   Prefetches [pB, #512] with PLDL1KEEP.
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this macro; presumably
// multiply-accumulate variants chosen per conjugation mode - confirm against
// their definitions.
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
// Lane 0 of v12/v13 -> v16/v17
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
// The registers written by the loads below (v8/v9, v0/v1) are not read
// anywhere in this macro.
ld2 {v8.4s, v9.4s}, [pB] // For next round ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// Lane 1 of v12/v13 -> v20/v21
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// Lane 2 of v12/v13 -> v24/v25
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// Lane 3 of v12/v13 -> v28/v29
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
// KERNEL4x4_E: accumulation-only step of the 4x4 kernel.
//   Performs the same sixteen OP_* updates as KERNEL4x4_M2 (v4/v5 with lanes
//   0..3 of v12/v13 into v16/v17, v20/v21, v24/v25, v28/v29) but issues no
//   loads, no pointer updates and no prefetch.
// NOTE(review): presumably used as the final step of the pipelined loop so
// that no data is fetched past the end of the packed panels - confirm against
// the loop that invokes it.
.macro KERNEL4x4_E .macro KERNEL4x4_E
// Lane 0 of v12/v13 -> v16/v17
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
// Lane 1 of v12/v13 -> v20/v21
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
// Lane 2 of v12/v13 -> v24/v25
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
// Lane 3 of v12/v13 -> v28/v29
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.4s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.4s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.4s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.4s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.4s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.4s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.4s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.4s[1] OP_ir v21.2s, v1.2s, v8.s[1]
OP_rr v24.2s, v0.2s, v8.4s[2] OP_rr v24.2s, v0.2s, v8.s[2]
OP_ii v24.2s, v1.2s, v9.4s[2] OP_ii v24.2s, v1.2s, v9.s[2]
OP_ri v25.2s, v0.2s, v9.4s[2] OP_ri v25.2s, v0.2s, v9.s[2]
OP_ir v25.2s, v1.2s, v8.4s[2] OP_ir v25.2s, v1.2s, v8.s[2]
OP_rr v28.2s, v0.2s, v8.4s[3] OP_rr v28.2s, v0.2s, v8.s[3]
OP_ii v28.2s, v1.2s, v9.4s[3] OP_ii v28.2s, v1.2s, v9.s[3]
OP_ri v29.2s, v0.2s, v9.4s[3] OP_ri v29.2s, v0.2s, v9.s[3]
OP_ir v29.2s, v1.2s, v8.4s[3] OP_ir v29.2s, v1.2s, v8.s[3]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.4s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.4s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.4s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.4s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.4s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.4s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.4s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.4s[1] OP_ir s21, s1, v8.s[1]
OP_rr s24, s0, v8.4s[2] OP_rr s24, s0, v8.s[2]
OP_ii s24, s1, v9.4s[2] OP_ii s24, s1, v9.s[2]
OP_ri s25, s0, v9.4s[2] OP_ri s25, s0, v9.s[2]
OP_ir s25, s1, v8.4s[2] OP_ir s25, s1, v8.s[2]
OP_rr s28, s0, v8.4s[3] OP_rr s28, s0, v8.s[3]
OP_ii s28, s1, v9.4s[3] OP_ii s28, s1, v9.s[3]
OP_ri s29, s0, v9.4s[3] OP_ri s29, s0, v9.s[3]
OP_ir s29, s1, v8.4s[3] OP_ir s29, s1, v8.s[3]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.2s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.2s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.2s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.2s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.2s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.2s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.2s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.2s[1] OP_ir v23.4s, v3.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.2s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.2s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.2s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.2s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.2s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.2s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.2s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.2s[1] OP_ir v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.2s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.2s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.2s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.2s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.2s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.2s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.2s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.2s[1] OP_ir s21, s1, v8.s[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v8.4s[1] OP_ii v16.4s, v1.4s, v8.s[1]
OP_ri v17.4s, v0.4s, v8.4s[1] OP_ri v17.4s, v0.4s, v8.s[1]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v8.4s[1] OP_ii v18.4s, v3.4s, v8.s[1]
OP_ri v19.4s, v2.4s, v8.4s[1] OP_ri v19.4s, v2.4s, v8.s[1]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1

View File

@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL4x4_M1: 4x4 accumulation step that overlaps arithmetic with the
// loads for the following step.
//   Inputs (already in registers): v0/v1 and lanes 0..3 of v8/v9.
//   Accumulators updated: v16/v17 (lane 0), v20/v21 (lane 1),
//                         v24/v25 (lane 2), v28/v29 (lane 3).
//   Side effects: loads v12/v13 from [pB] and v4/v5 from [pA], each followed
//                 by a 32-byte pointer advance; prefetches [pA, #512] with
//                 PLDL1KEEP.
// The freshly loaded v12/v13 and v4/v5 are not read here; KERNEL4x4_M2 and
// KERNEL4x4_E are the macros that consume them.
// OP_rr/OP_ii/OP_ri/OP_ir come from outside this macro; presumably
// multiply-accumulate forms selected per conjugation mode - confirm against
// their definitions.
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
// Lane 0 of v8/v9 -> v16/v17
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// Lane 1 of v8/v9 -> v20/v21
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// Lane 2 of v8/v9 -> v24/v25
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
// Lane 3 of v8/v9 -> v28/v29
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
// KERNEL4x4_M2: 4x4 accumulation step on the alternate register set.
//   Inputs (already in registers): v4/v5 and lanes 0..3 of v12/v13, i.e. the
//   registers KERNEL4x4_M1 loads.
//   Accumulators updated: v16/v17 (lane 0), v20/v21 (lane 1),
//                         v24/v25 (lane 2), v28/v29 (lane 3).
//   Side effects: loads v8/v9 from [pB] and v0/v1 from [pA], each followed by
//                 a 32-byte pointer advance; prefetches [pB, #512] with
//                 PLDL1KEEP.
// The freshly loaded v8/v9 and v0/v1 are not read here; KERNEL4x4_M1 is the
// macro that consumes them.
// OP_rr/OP_ii/OP_ri/OP_ir come from outside this macro; presumably
// multiply-accumulate forms selected per conjugation mode - confirm against
// their definitions.
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
// Lane 0 of v12/v13 -> v16/v17
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// Lane 1 of v12/v13 -> v20/v21
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// Lane 2 of v12/v13 -> v24/v25
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// Lane 3 of v12/v13 -> v28/v29
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
// KERNEL4x4_E: arithmetic-only 4x4 step.
//   Applies the sixteen OP_* updates of KERNEL4x4_M2 (v4/v5 against lanes
//   0..3 of v12/v13, into v16/v17, v20/v21, v24/v25, v28/v29) without
//   touching pA/pB: no loads, no pointer advances, no prefetch.
// NOTE(review): presumably the last step of the pipelined loop, working on
// the registers the preceding KERNEL4x4_M1 loaded - confirm against the
// invoking loop.
.macro KERNEL4x4_E .macro KERNEL4x4_E
// Lane 0 of v12/v13 -> v16/v17
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
// Lane 1 of v12/v13 -> v20/v21
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
// Lane 2 of v12/v13 -> v24/v25
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
// Lane 3 of v12/v13 -> v28/v29
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.4s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.4s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.4s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.4s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.4s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.4s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.4s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.4s[1] OP_ir v21.2s, v1.2s, v8.s[1]
OP_rr v24.2s, v0.2s, v8.4s[2] OP_rr v24.2s, v0.2s, v8.s[2]
OP_ii v24.2s, v1.2s, v9.4s[2] OP_ii v24.2s, v1.2s, v9.s[2]
OP_ri v25.2s, v0.2s, v9.4s[2] OP_ri v25.2s, v0.2s, v9.s[2]
OP_ir v25.2s, v1.2s, v8.4s[2] OP_ir v25.2s, v1.2s, v8.s[2]
OP_rr v28.2s, v0.2s, v8.4s[3] OP_rr v28.2s, v0.2s, v8.s[3]
OP_ii v28.2s, v1.2s, v9.4s[3] OP_ii v28.2s, v1.2s, v9.s[3]
OP_ri v29.2s, v0.2s, v9.4s[3] OP_ri v29.2s, v0.2s, v9.s[3]
OP_ir v29.2s, v1.2s, v8.4s[3] OP_ir v29.2s, v1.2s, v8.s[3]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.4s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.4s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.4s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.4s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.4s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.4s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.4s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.4s[1] OP_ir s21, s1, v8.s[1]
OP_rr s24, s0, v8.4s[2] OP_rr s24, s0, v8.s[2]
OP_ii s24, s1, v9.4s[2] OP_ii s24, s1, v9.s[2]
OP_ri s25, s0, v9.4s[2] OP_ri s25, s0, v9.s[2]
OP_ir s25, s1, v8.4s[2] OP_ir s25, s1, v8.s[2]
OP_rr s28, s0, v8.4s[3] OP_rr s28, s0, v8.s[3]
OP_ii s28, s1, v9.4s[3] OP_ii s28, s1, v9.s[3]
OP_ri s29, s0, v9.4s[3] OP_ri s29, s0, v9.s[3]
OP_ir s29, s1, v8.4s[3] OP_ir s29, s1, v8.s[3]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.2s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.2s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.2s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.2s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.2s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.2s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.2s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.2s[1] OP_ir v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.2s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.2s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.2s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.2s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.2s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.2s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.2s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.2s[1] OP_ir s21, s1, v8.s[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2

664
kernel/arm64/ctrmm_kernel_8x4.S Executable file → Normal file
View File

@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v18.4s, v2.4s, v8.4s[0] fmul v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.4s, v2.4s, v9.4s[0] fmls v19.4s, v2.4s, v9.s[0]
#else #else
fmul v19.4s, v2.4s, v9.4s[0] fmul v19.4s, v2.4s, v9.s[0]
#endif #endif
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v22.4s, v2.4s, v8.4s[1] fmul v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.4s, v2.4s, v9.4s[1] fmls v23.4s, v2.4s, v9.s[1]
#else #else
fmul v23.4s, v2.4s, v9.4s[1] fmul v23.4s, v2.4s, v9.s[1]
#endif #endif
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v26.4s, v2.4s, v8.4s[2] fmul v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.4s, v2.4s, v9.4s[2] fmls v27.4s, v2.4s, v9.s[2]
#else #else
fmul v27.4s, v2.4s, v9.4s[2] fmul v27.4s, v2.4s, v9.s[2]
#endif #endif
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
fmul v30.4s, v2.4s, v8.4s[3] fmul v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.4s, v2.4s, v9.4s[3] fmls v31.4s, v2.4s, v9.s[3]
#else #else
fmul v31.4s, v2.4s, v9.4s[3] fmul v31.4s, v2.4s, v9.s[3]
#endif #endif
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
ld2 {v8.4s, v9.4s}, [pB] ld2 {v8.4s, v9.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL8x4_E: arithmetic-only step with no loads and no pointer updates.
// Applies the OP_rr/OP_ii/OP_ri/OP_ir helper macros (defined outside this
// excerpt) with v4-v7 as the vector inputs, lanes 0..3 of v12/v13 as the
// per-lane inputs, and v16-v31 as the destination registers.
// NOTE(review): the rr/ii/ri/ir naming suggests real/imaginary pairings of a
// complex multiply-accumulate; confirm against the OP_* definitions.
// Each line in this listing pairs the old lane spelling (vN.4s[i]) with the
// new one (vN.s[i]); registers and lane indices are the same in both.
.macro KERNEL8x4_E .macro KERNEL8x4_E
// Lane 0 of v12/v13 -> v16..v19
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
// Lane 1 of v12/v13 -> v20..v23
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
// Lane 2 of v12/v13 -> v24..v27
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
// Lane 3 of v12/v13 -> v28..v31
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
.endm .endm
@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
.endm .endm
@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL4x4_M1: one step that consumes v0/v1 and lanes 0..3 of v8/v9, with
// v16/v17, v20/v21, v24/v25, v28/v29 as destinations of the OP_* helper
// macros (defined outside this excerpt). Interleaved with the arithmetic it
// loads v12/v13 from pB and v4/v5 from pA (each pointer advanced by 32 bytes);
// those are the registers KERNEL4x4_M2 and KERNEL4x4_E read.
// NOTE(review): rr/ii/ri/ir presumably denote real/imaginary pairings of a
// complex multiply-accumulate; confirm against the OP_* definitions.
// Each line in this listing pairs the old lane spelling (vN.4s[i]) with the
// new one (vN.s[i]); registers and lane indices are the same in both.
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
// Lane 0 of v8/v9 -> v16/v17
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
// ld2 de-interleaves 8 consecutive floats: even elements to v12, odd to v13.
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// Lane 1 of v8/v9 -> v20/v21
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
// Same de-interleaving load for the pA side: even elements to v4, odd to v5.
ld2 {v4.4s, v5.4s}, [pA] // For next round ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// Lane 2 of v8/v9 -> v24/v25
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
// Prefetch hint for data 512 bytes past the already-advanced pA.
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
// Lane 3 of v8/v9 -> v28/v29
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
// KERNEL4x4_M2: counterpart of KERNEL4x4_M1 using the other register set.
// Consumes v4/v5 and lanes 0..3 of v12/v13, with v16/v17, v20/v21, v24/v25,
// v28/v29 as destinations of the OP_* helper macros (defined outside this
// excerpt). Interleaved with the arithmetic it reloads v8/v9 from pB and
// v0/v1 from pA (each pointer advanced by 32 bytes), i.e. the registers
// KERNEL4x4_M1 reads.
// NOTE(review): rr/ii/ri/ir presumably denote real/imaginary pairings of a
// complex multiply-accumulate; confirm against the OP_* definitions.
// Each line in this listing pairs the old lane spelling (vN.4s[i]) with the
// new one (vN.s[i]); registers and lane indices are the same in both.
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
// Lane 0 of v12/v13 -> v16/v17
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
// ld2 de-interleaves 8 consecutive floats: even elements to v8, odd to v9.
ld2 {v8.4s, v9.4s}, [pB] // For next round ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// Lane 1 of v12/v13 -> v20/v21
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
// Same de-interleaving load for the pA side: even elements to v0, odd to v1.
ld2 {v0.4s, v1.4s}, [pA] // For next round ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// Lane 2 of v12/v13 -> v24/v25
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
// Prefetch hint for data 512 bytes past the already-advanced pB.
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// Lane 3 of v12/v13 -> v28/v29
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
// KERNEL4x4_E: same sixteen OP_* invocations as KERNEL4x4_M2 (inputs v4/v5
// and lanes 0..3 of v12/v13, destinations v16/v17, v20/v21, v24/v25, v28/v29)
// but with no loads, pointer updates or prefetches.
// NOTE(review): presumably the final step of an unrolled sequence, since it
// fetches nothing for a following step; the calling loop is not visible here.
// Each line in this listing pairs the old lane spelling (vN.4s[i]) with the
// new one (vN.s[i]); registers and lane indices are the same in both.
.macro KERNEL4x4_E .macro KERNEL4x4_E
// Lane 0 of v12/v13 -> v16/v17
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
// Lane 1 of v12/v13 -> v20/v21
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
// Lane 2 of v12/v13 -> v24/v25
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
// Lane 3 of v12/v13 -> v28/v29
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.4s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.4s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.4s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.4s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.4s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.4s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.4s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.4s[1] OP_ir v21.2s, v1.2s, v8.s[1]
OP_rr v24.2s, v0.2s, v8.4s[2] OP_rr v24.2s, v0.2s, v8.s[2]
OP_ii v24.2s, v1.2s, v9.4s[2] OP_ii v24.2s, v1.2s, v9.s[2]
OP_ri v25.2s, v0.2s, v9.4s[2] OP_ri v25.2s, v0.2s, v9.s[2]
OP_ir v25.2s, v1.2s, v8.4s[2] OP_ir v25.2s, v1.2s, v8.s[2]
OP_rr v28.2s, v0.2s, v8.4s[3] OP_rr v28.2s, v0.2s, v8.s[3]
OP_ii v28.2s, v1.2s, v9.4s[3] OP_ii v28.2s, v1.2s, v9.s[3]
OP_ri v29.2s, v0.2s, v9.4s[3] OP_ri v29.2s, v0.2s, v9.s[3]
OP_ir v29.2s, v1.2s, v8.4s[3] OP_ir v29.2s, v1.2s, v8.s[3]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.4s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.4s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.4s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.4s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.4s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.4s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.4s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.4s[1] OP_ir s21, s1, v8.s[1]
OP_rr s24, s0, v8.4s[2] OP_rr s24, s0, v8.s[2]
OP_ii s24, s1, v9.4s[2] OP_ii s24, s1, v9.s[2]
OP_ri s25, s0, v9.4s[2] OP_ri s25, s0, v9.s[2]
OP_ir s25, s1, v8.4s[2] OP_ir s25, s1, v8.s[2]
OP_rr s28, s0, v8.4s[3] OP_rr s28, s0, v8.s[3]
OP_ii s28, s1, v9.4s[3] OP_ii s28, s1, v9.s[3]
OP_ri s29, s0, v9.4s[3] OP_ri s29, s0, v9.s[3]
OP_ir s29, s1, v8.4s[3] OP_ir s29, s1, v8.s[3]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.2s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.2s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.2s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.2s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.2s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.2s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.2s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.2s[1] OP_ir v23.4s, v3.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.2s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.2s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.2s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.2s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.2s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.2s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.2s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.2s[1] OP_ir v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.2s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.2s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.2s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.2s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.2s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.2s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.2s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.2s[1] OP_ir s21, s1, v8.s[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v8.4s[1] OP_ii v16.4s, v1.4s, v8.s[1]
OP_ri v17.4s, v0.4s, v8.4s[1] OP_ri v17.4s, v0.4s, v8.s[1]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v8.4s[1] OP_ii v18.4s, v3.4s, v8.s[1]
OP_ri v19.4s, v2.4s, v8.4s[1] OP_ri v19.4s, v2.4s, v8.s[1]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1

View File

@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldp q0, q1, [pA] ldp q0, q1, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v29.2d, v1.2d, v11.2d[0] fmul v29.2d, v1.2d, v11.d[0]
ldp q2, q3, [ppA] ldp q2, q3, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmul v20.2d, v0.2d, v9.2d[0] fmul v20.2d, v0.2d, v9.d[0]
fmul v25.2d, v1.2d, v10.2d[0] fmul v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v18.2d, v2.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.d[0]
fmul v31.2d, v3.2d, v11.2d[0] fmul v31.2d, v3.2d, v11.d[0]
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
fmul v22.2d, v2.2d, v9.2d[0] fmul v22.2d, v2.2d, v9.d[0]
fmul v27.2d, v3.2d, v10.2d[0] fmul v27.2d, v3.2d, v10.d[0]
ldp d12, d13, [pB] ldp d12, d13, [pB]
add pB, pB, #16 add pB, pB, #16
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
fmul v21.2d, v1.2d, v9.2d[0] fmul v21.2d, v1.2d, v9.d[0]
ldp q4, q5, [pA] // for next round ldp q4, q5, [pA] // for next round
add pA, pA, #32 add pA, pA, #32
fmul v26.2d, v2.2d, v10.2d[0] fmul v26.2d, v2.2d, v10.d[0]
fmul v23.2d, v3.2d, v9.2d[0] fmul v23.2d, v3.2d, v9.d[0]
ldp q6, q7, [ppA] // for next round ldp q6, q7, [ppA] // for next round
add ppA, ppA, #32 add ppA, ppA, #32
fmul v28.2d, v0.2d, v11.2d[0] fmul v28.2d, v0.2d, v11.d[0]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
ldp d14, d15, [pB] ldp d14, d15, [pB]
add pB, pB, #16 add pB, pB, #16
fmul v30.2d, v2.2d, v11.2d[0] fmul v30.2d, v2.2d, v11.d[0]
fmul v19.2d, v3.2d, v8.2d[0] fmul v19.2d, v3.2d, v8.d[0]
.endm .endm
// KERNEL8x4_M2: sixteen fused multiply-adds into v16..v31 using v4..v7
// (two doubles each) and lane 0 of v12..v15. The accumulator for input
// v(4+i) and scalar v(12+j) is v(16 + 4*j + i).
// Interleaved with the arithmetic it loads the operands that KERNEL8x4_M1
// reads: d8..d11 from pB (two 16-byte steps), q0/q1 from pA and q2/q3 from
// ppA (32 bytes each), advancing each pointer past what it loaded.
// The scalar operands are loaded as D registers and every multiply indexes
// lane 0 only.
// Each line in this listing pairs the old lane spelling (vN.2d[0]) with the
// new one (vN.d[0]); registers and lane indices are the same in both.
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
// First pair of scalars for the next step.
ldp d8, d9, [pB] ldp d8, d9, [pB]
add pB, pB, #16 add pB, pB, #16
fmla v18.2d, v6.2d, v12.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v31.2d, v7.2d, v15.2d[0] fmla v31.2d, v7.2d, v15.d[0]
// Second pair of scalars for the next step.
ldp d10, d11, [pB] ldp d10, d11, [pB]
add pB, pB, #16 add pB, pB, #16
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
// Prefetch hint ahead of pB; B_PRE_SIZE is defined outside this excerpt.
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v22.2d, v6.2d, v13.2d[0] fmla v22.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.2d[0] fmla v27.2d, v7.2d, v14.d[0]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
// Vector operands from pA for the next step.
ldp q0, q1, [pA] ldp q0, q1, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v26.2d, v6.2d, v14.2d[0] fmla v26.2d, v6.2d, v14.d[0]
fmla v23.2d, v7.2d, v13.2d[0] fmla v23.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
// Vector operands from ppA for the next step.
ldp q2, q3, [ppA] ldp q2, q3, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmla v30.2d, v6.2d, v15.2d[0] fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v19.2d, v7.2d, v12.d[0]
.endm .endm
// KERNEL8x4_M1: sixteen fused multiply-adds into v16..v31 using v0..v3
// (two doubles each) and lane 0 of v8..v11. The accumulator for input
// v(i) and scalar v(8+j) is v(16 + 4*j + i).
// Interleaved with the arithmetic it loads the operands that KERNEL8x4_M2
// and KERNEL8x4_E read: d12..d15 from pB (two 16-byte steps), q4/q5 from pA
// and q6/q7 from ppA (32 bytes each), advancing each pointer past what it
// loaded.
// The scalar operands are loaded as D registers and every multiply indexes
// lane 0 only.
// Each line in this listing pairs the old lane spelling (vN.2d[0]) with the
// new one (vN.d[0]); registers and lane indices are the same in both.
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
// First pair of scalars for the next step.
ldp d12, d13, [pB] ldp d12, d13, [pB]
add pB, pB, #16 add pB, pB, #16
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v31.2d, v3.2d, v11.2d[0] fmla v31.2d, v3.2d, v11.d[0]
// Second pair of scalars for the next step.
ldp d14, d15, [pB] ldp d14, d15, [pB]
add pB, pB, #16 add pB, pB, #16
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
// Prefetch hint ahead of pA; A_PRE_SIZE is defined outside this excerpt.
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v22.2d, v2.2d, v9.2d[0] fmla v22.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v10.2d[0] fmla v27.2d, v3.2d, v10.d[0]
// Matching prefetch hint ahead of ppA.
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
// Vector operands from pA for the next step.
ldp q4, q5, [pA] ldp q4, q5, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v26.2d, v2.2d, v10.2d[0] fmla v26.2d, v2.2d, v10.d[0]
fmla v23.2d, v3.2d, v9.2d[0] fmla v23.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
// Vector operands from ppA for the next step.
ldp q6, q7, [ppA] ldp q6, q7, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmla v30.2d, v2.2d, v11.2d[0] fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
.endm .endm
// KERNEL8x4_E: the same sixteen fused multiply-adds as KERNEL8x4_M2
// (v4..v7 times lane 0 of v12..v15, accumulated into v16..v31, with
// v(16 + 4*j + i) receiving v(4+i) * v(12+j)), issued in a different order
// and with no loads, pointer updates or prefetches.
// NOTE(review): presumably the final step of an unrolled sequence, since it
// fetches nothing for a following step; the calling loop is not visible here.
// Each line in this listing pairs the old lane spelling (vN.2d[0]) with the
// new one (vN.d[0]); registers and lane indices are the same in both.
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v18.2d, v6.2d, v12.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v27.2d, v7.2d, v14.2d[0] fmla v27.2d, v7.2d, v14.d[0]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v22.2d, v6.2d, v13.2d[0] fmla v22.2d, v6.2d, v13.d[0]
fmla v31.2d, v7.2d, v15.2d[0] fmla v31.2d, v7.2d, v15.d[0]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v26.2d, v6.2d, v14.2d[0] fmla v26.2d, v6.2d, v14.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v19.2d, v7.2d, v12.d[0]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v30.2d, v6.2d, v15.2d[0] fmla v30.2d, v6.2d, v15.d[0]
fmla v23.2d, v7.2d, v13.2d[0] fmla v23.2d, v7.2d, v13.d[0]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldp q0, q1, [pA] ldp q0, q1, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
ldp q2, q3, [ppA] ldp q2, q3, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v31.2d, v3.2d, v11.2d[0] fmla v31.2d, v3.2d, v11.d[0]
fmla v22.2d, v2.2d, v9.2d[0] fmla v22.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v10.2d[0] fmla v27.2d, v3.2d, v10.d[0]
fmla v26.2d, v2.2d, v10.2d[0] fmla v26.2d, v2.2d, v10.d[0]
fmla v23.2d, v3.2d, v9.2d[0] fmla v23.2d, v3.2d, v9.d[0]
fmla v30.2d, v2.2d, v11.2d[0] fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

284
kernel/arm64/dgemm_kernel_4x8.S Executable file → Normal file
View File

@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v0.2d, v8.2d[1] fmul v18.2d, v0.2d, v8.d[1]
fmul v19.2d, v1.2d, v8.2d[1] fmul v19.2d, v1.2d, v8.d[1]
fmul v20.2d, v0.2d, v9.2d[0] fmul v20.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v9.2d[0] fmul v21.2d, v1.2d, v9.d[0]
fmul v22.2d, v0.2d, v9.2d[1] fmul v22.2d, v0.2d, v9.d[1]
fmul v23.2d, v1.2d, v9.2d[1] fmul v23.2d, v1.2d, v9.d[1]
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
fmul v25.2d, v1.2d, v10.2d[0] fmul v25.2d, v1.2d, v10.d[0]
fmul v26.2d, v0.2d, v10.2d[1] fmul v26.2d, v0.2d, v10.d[1]
fmul v27.2d, v1.2d, v10.2d[1] fmul v27.2d, v1.2d, v10.d[1]
fmul v28.2d, v0.2d, v11.2d[0] fmul v28.2d, v0.2d, v11.d[0]
fmul v29.2d, v1.2d, v11.2d[0] fmul v29.2d, v1.2d, v11.d[0]
fmul v30.2d, v0.2d, v11.2d[1] fmul v30.2d, v0.2d, v11.d[1]
fmul v31.2d, v1.2d, v11.2d[1] fmul v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M1 .macro KERNEL4x8_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v19.2d, v1.2d, v8.2d[1] fmla v19.2d, v1.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v23.2d, v1.2d, v9.2d[1] fmla v23.2d, v1.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v27.2d, v1.2d, v10.2d[1] fmla v27.2d, v1.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
fmla v31.2d, v1.2d, v11.2d[1] fmla v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M2 .macro KERNEL4x8_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v4.2d, v12.2d[1] fmla v18.2d, v4.2d, v12.d[1]
fmla v19.2d, v5.2d, v12.2d[1] fmla v19.2d, v5.2d, v12.d[1]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v22.2d, v4.2d, v13.2d[1] fmla v22.2d, v4.2d, v13.d[1]
fmla v23.2d, v5.2d, v13.2d[1] fmla v23.2d, v5.2d, v13.d[1]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v26.2d, v4.2d, v14.2d[1] fmla v26.2d, v4.2d, v14.d[1]
fmla v27.2d, v5.2d, v14.2d[1] fmla v27.2d, v5.2d, v14.d[1]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v30.2d, v4.2d, v15.2d[1] fmla v30.2d, v4.2d, v15.d[1]
fmla v31.2d, v5.2d, v15.2d[1] fmla v31.2d, v5.2d, v15.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_E .macro KERNEL4x8_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v4.2d, v12.2d[1] fmla v18.2d, v4.2d, v12.d[1]
fmla v19.2d, v5.2d, v12.2d[1] fmla v19.2d, v5.2d, v12.d[1]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v22.2d, v4.2d, v13.2d[1] fmla v22.2d, v4.2d, v13.d[1]
fmla v23.2d, v5.2d, v13.2d[1] fmla v23.2d, v5.2d, v13.d[1]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v26.2d, v4.2d, v14.2d[1] fmla v26.2d, v4.2d, v14.d[1]
fmla v27.2d, v5.2d, v14.2d[1] fmla v27.2d, v5.2d, v14.d[1]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v30.2d, v4.2d, v15.2d[1] fmla v30.2d, v4.2d, v15.d[1]
fmla v31.2d, v5.2d, v15.2d[1] fmla v31.2d, v5.2d, v15.d[1]
.endm .endm
.macro KERNEL4x8_SUB .macro KERNEL4x8_SUB
@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v19.2d, v1.2d, v8.2d[1] fmla v19.2d, v1.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v23.2d, v1.2d, v9.2d[1] fmla v23.2d, v1.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v27.2d, v1.2d, v10.2d[1] fmla v27.2d, v1.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
fmla v31.2d, v1.2d, v11.2d[1] fmla v31.2d, v1.2d, v11.d[1]
.endm .endm
.macro SAVE4x8 .macro SAVE4x8
@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
.endm .endm
.macro SAVE2x8 .macro SAVE2x8
@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v29.2d, v1.2d, v9.2d[1] fmul v29.2d, v1.2d, v9.d[1]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
fmul v25.2d, v1.2d, v9.2d[0] fmul v25.2d, v1.2d, v9.d[0]
fmul v24.2d, v0.2d, v9.2d[0] fmul v24.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v8.2d[1] fmul v21.2d, v1.2d, v8.d[1]
fmul v28.2d, v0.2d, v9.2d[1] fmul v28.2d, v0.2d, v9.d[1]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

478
kernel/arm64/dgemm_kernel_8x4.S Executable file → Normal file
View File

@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10 #define alpha0 d10
#define alphaV0 v10.d[0] #define alphaV0 v10.d[0]
#define alpha1 d11
#define alphaV1 v11.d[0] #define A_PRE_SIZE 2560
#define alpha2 d14 #define B_PRE_SIZE 448
#define alphaV2 v14.d[0] #define C_PRE_SIZE 128
#define alpha3 d15
#define alphaV3 v15.d[0]
// 00 origM // 00 origM
// 01 origN // 01 origN
@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0 // 12 pCRow0
// 13 pCRow1 // 13 pCRow1
// 14 pCRow2 // 14 pCRow2
// 15 pA // 15 pCRow3
// 16 // 16 pA
// 17 // 17
// 18 must save // 18 must save
// 19 must save // 19 must save
@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3 //v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5 //v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7 //v07 pA1_6, pA1_7
//v08 must save pB0_0, pB0_1 //v08 must save pB0_0
//v09 must save pB0_2, pB0_3 //v09 must save pB0_1
//v10 must save ALPHA0 //v10 must save pB0_2 --> ALPHA0
//v11 must save ALPHA1 //v11 must save pB0_3
//v12 must save pB1_0, pB1_1 //v12 must save pB1_0
//v13 must save pB1_2, pB1_3 //v13 must save pB1_1
//v14 must save ALPHA2 //v14 must save pB1_2
//v15 must save ALPHA3 //v15 must save pB1_3
//v16 must save C00, C01 //v16 must save C00, C01
//v17 must save C02, C03 //v17 must save C02, C03
//v18 C04, C05 //v18 C04, C05
@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_I .macro KERNEL8x4_I
ld1 {v0.2d, v1.2d}, [pA] ldp q0, q1, [pA], #32
add pA, pA, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp d8, d9, [pB]
add pB, pB, #16
ldp d10, d11, [pB]
add pB, pB, #16
fmul v16.2d, v0.2d, v8.2d[0] ldp d8, d9, [pB], #16
fmul v17.2d, v1.2d, v8.2d[0]
fmul v18.2d, v2.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v19.2d, v3.2d, v8.2d[0] fmul v20.2d, v0.2d, v9.d[0]
fmul v20.2d, v0.2d, v9.2d[0] ldp d10, d11, [pB], #16
fmul v21.2d, v1.2d, v9.2d[0]
fmul v22.2d, v2.2d, v9.2d[0] fmul v17.2d, v1.2d, v8.d[0]
fmul v23.2d, v3.2d, v9.2d[0] fmul v21.2d, v1.2d, v9.d[0]
fmul v24.2d, v0.2d, v10.2d[0] ldp q2, q3, [pA], #32
fmul v25.2d, v1.2d, v10.2d[0]
fmul v26.2d, v2.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
fmul v27.2d, v3.2d, v10.2d[0] fmul v28.2d, v0.2d, v11.d[0]
fmul v28.2d, v0.2d, v11.2d[0] ldp q4, q5, [pA], #32
fmul v29.2d, v1.2d, v11.2d[0]
fmul v30.2d, v2.2d, v11.2d[0] fmul v25.2d, v1.2d, v10.d[0]
fmul v31.2d, v3.2d, v11.2d[0] fmul v29.2d, v1.2d, v11.d[0]
ld1 {v4.2d, v5.2d}, [pA] ldp d12, d13, [pB], #16
add pA, pA, #32
ld1 {v6.2d, v7.2d}, [pA] fmul v18.2d, v2.2d, v8.d[0]
add pA, pA, #32 fmul v22.2d, v2.2d, v9.d[0]
ldp d12, d13, [pB]
add pB, pB, #16 ldp d14, d15, [pB], #16
ldp d14, d15, [pB]
add pB, pB, #16 fmul v26.2d, v2.2d, v10.d[0]
fmul v30.2d, v2.2d, v11.d[0]
ldp q6, q7, [pA], #32
fmul v19.2d, v3.2d, v8.d[0]
fmul v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v31.2d, v3.2d, v11.d[0]
fmul v23.2d, v3.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v26.2d, v2.2d, v10.2d[0]
fmla v31.2d, v3.2d, v11.2d[0]
ld1 {v4.2d}, [pA], #16 ldp q4, q5, [pA], #32
fmla v20.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v28.2d, v0.2d, v11.d[0]
ld1 {v5.2d}, [pA], #16 ldp d12, d13, [pB], #16
fmla v30.2d, v2.2d, v11.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
ldp d12, d13, [pB] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
add pB, pB, #16
fmla v28.2d, v0.2d, v11.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v29.2d, v1.2d, v11.d[0]
ldp d14, d15, [pB] ldp d14, d15, [pB], #16
add pB, pB, #16
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.2d[0] fmla v22.2d, v2.2d, v9.d[0]
ld1 {v6.2d}, [pA], #16 prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v24.2d, v0.2d, v10.2d[0] fmla v26.2d, v2.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.d[0]
ld1 {v7.2d}, [pA], #16 ldp q6, q7, [pA], #32
fmla v22.2d, v2.2d, v9.2d[0] fmla v27.2d, v3.2d, v10.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v31.2d, v3.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #224]
prfm PLDL1KEEP, [pA, #224+64]
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v26.2d, v6.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
ld1 {v0.2d}, [pA], #16 ldp q0, q1, [pA], #32
fmla v20.2d, v4.2d, v13.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v25.2d, v5.2d, v14.d[0]
ld1 {v1.2d}, [pA], #16 ldp d8, d9, [pB], #16
fmla v30.2d, v6.2d, v15.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.2d[0] fmla v29.2d, v5.2d, v15.d[0]
ldp d8, d9, [pB] ldp d10, d11, [pB], #16
add pB, pB, #16
fmla v28.2d, v4.2d, v15.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v22.2d, v6.2d, v13.d[0]
ldp d10, d11, [pB] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
add pB, pB, #16
fmla v22.2d, v6.2d, v13.2d[0] fmla v26.2d, v6.2d, v14.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v30.2d, v6.2d, v15.d[0]
ld1 {v2.2d}, [pA], #16 fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.2d[0] ldp q2, q3, [pA], #32
fmla v29.2d, v5.2d, v15.2d[0]
ld1 {v3.2d}, [pA], #16 fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
fmla v18.2d, v6.2d, v12.2d[0]
fmla v23.2d, v7.2d, v13.2d[0]
prfm PLDL1KEEP, [pB, #640]
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v18.2d, v6.2d, v12.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v20.2d, v4.2d, v13.2d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v23.2d, v7.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v25.2d, v5.2d, v14.2d[0]
fmla v26.2d, v6.2d, v14.2d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v27.2d, v7.2d, v14.2d[0]
fmla v28.2d, v4.2d, v15.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v22.2d, v6.2d, v13.d[0]
fmla v30.2d, v6.2d, v15.2d[0] fmla v26.2d, v6.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.2d[0] fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
ld1 {v0.2d, v1.2d}, [pA] ldp q0, q1, [pA], #32
add pA, pA, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp d8, d9, [pB]
add pB, pB, #16
ldp d10, d11, [pB]
add pB, pB, #16
fmla v16.2d, v0.2d, v8.2d[0] ldp d8, d9, [pB], #16
fmla v17.2d, v1.2d, v8.2d[0]
fmla v18.2d, v2.2d, v8.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
fmla v20.2d, v0.2d, v9.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v22.2d, v2.2d, v9.2d[0]
fmla v23.2d, v3.2d, v9.2d[0]
fmla v24.2d, v0.2d, v10.2d[0] ldp d10, d11, [pB], #16
fmla v25.2d, v1.2d, v10.2d[0]
fmla v26.2d, v2.2d, v10.2d[0]
fmla v27.2d, v3.2d, v10.2d[0]
fmla v28.2d, v0.2d, v11.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v30.2d, v2.2d, v11.2d[0]
fmla v31.2d, v3.2d, v11.2d[0] ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
fmla v25.2d, v1.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v19.2d, v3.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
fmla v23.2d, v3.2d, v9.d[0]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
fmov alpha0, alpha fmov alpha0, alpha
ld1 {v0.2d, v1.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ldp q0, q1, [pCRow0]
fmla v0.2d, v16.2d, alphaV0 fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0
st1 {v0.2d, v1.2d}, [pCRow0] stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld1 {v2.2d, v3.2d}, [pCRow0] ldp q2, q3, [pCRow0]
fmla v2.2d, v18.2d, alphaV0 fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0
st1 {v2.2d, v3.2d}, [pCRow0] stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
ld1 {v4.2d, v5.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ldp q4, q5, [pCRow1]
fmla v4.2d, v20.2d, alphaV0 fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0
st1 {v4.2d, v5.2d}, [pCRow1] stp q4, q5, [pCRow1]
add pCRow1, pCRow1, #32 add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1 {v6.2d, v7.2d}, [pCRow1] ldp q6, q7, [pCRow1]
fmla v6.2d, v22.2d, alphaV0 fmla v6.2d, v22.2d, alphaV0
fmla v7.2d, v23.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0
st1 {v6.2d, v7.2d}, [pCRow1] stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32 add pCRow1, pCRow1, #32
ld1 {v0.2d, v1.2d}, [pCRow2] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q0, q1, [pCRow2]
fmla v0.2d, v24.2d, alphaV0 fmla v0.2d, v24.2d, alphaV0
fmla v1.2d, v25.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0
st1 {v0.2d, v1.2d}, [pCRow2] stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32 add pCRow2, pCRow2, #32
ld1 {v2.2d, v3.2d}, [pCRow2] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q2, q3, [pCRow2]
fmla v2.2d, v26.2d, alphaV0 fmla v2.2d, v26.2d, alphaV0
fmla v3.2d, v27.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0
st1 {v2.2d, v3.2d}, [pCRow2] stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32 add pCRow2, pCRow2, #32
ld1 {v4.2d, v5.2d}, [pCRow3] prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ldp q4, q5, [pCRow3]
fmla v4.2d, v28.2d, alphaV0 fmla v4.2d, v28.2d, alphaV0
fmla v5.2d, v29.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0
st1 {v4.2d, v5.2d}, [pCRow3] stp q4, q5, [pCRow3]
add pCRow3, pCRow3, #32 add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ld1 {v6.2d, v7.2d}, [pCRow3] ldp q6, q7, [pCRow3]
fmla v6.2d, v30.2d, alphaV0 fmla v6.2d, v30.2d, alphaV0
fmla v7.2d, v31.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0
st1 {v6.2d, v7.2d}, [pCRow3] stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32 add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow0, #128]
prfm PLDL2KEEP, [pCRow1, #128]
prfm PLDL2KEEP, [pCRow2, #128]
prfm PLDL2KEEP, [pCRow3, #128]
.endm .endm
/******************************************************************************/ /******************************************************************************/
@ -408,44 +419,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0] ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1 fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0] st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1] ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV2 fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV3 fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1] st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
ld1 {v8.2d, v9.2d}, [pCRow2] ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0 fmla v8.2d, v24.2d, alphaV0
fmla v9.2d, v25.2d, alphaV1 fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2] st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC add pCRow1, pCRow2, LDC
ld1 {v12.2d, v13.2d}, [pCRow1] ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v28.2d, alphaV2 fmla v12.2d, v28.2d, alphaV0
fmla v13.2d, v29.2d, alphaV3 fmla v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1] st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
@ -467,13 +479,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0] ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0] st1 {v8.2d}, [pCRow0]
@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v12.2d}, [pCRow1] ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV1 fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1] st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
ld1 {v8.2d}, [pCRow2] ld1 {v8.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV2 fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2] st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC add pCRow1, pCRow2, LDC
ld1 {v12.2d}, [pCRow1] ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v28.2d, alphaV3 fmla v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1] st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16 add pCRow0, pCRow0, #16
@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[0], [pCRow0]
@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[0], [pCRow2]
ld1 {v12.d}[1], [pCRow1] ld1 {v12.d}[1], [pCRow1]
fmla v12.2d, v20.2d, alphaV1 fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2] st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1] st1 {v12.d}[1], [pCRow1]
@ -559,32 +573,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.2d[1] fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.2d[1] fmla v23.2d, v3.2d, v8.d[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0 fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV1 fmla v1.2d, v17.2d, alphaV0
fmla v2.2d, v18.2d, alphaV2 fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV3 fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0 fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV1 fmla v5.2d, v21.2d, alphaV0
fmla v6.2d, v22.2d, alphaV2 fmla v6.2d, v22.2d, alphaV0
fmla v7.2d, v23.2d, alphaV3 fmla v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64 add pCRow0, pCRow0, #64
@ -605,23 +620,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0] ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1 fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0] st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1] ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV2 fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV3 fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1] st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
@ -641,11 +657,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0] ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0] st1 {v8.2d}, [pCRow0]
@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1 , pCRow0, LDC add pCRow1 , pCRow0, LDC
ld1 {v12.2d}, [pCRow1] ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV1 fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1] st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16 add pCRow0, pCRow0, #16
@ -672,10 +689,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
fmov alpha0, alpha
add pCRow1 , pCRow0, LDC add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[0], [pCRow0]
@ -706,18 +724,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
fmov alpha0, alpha
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0 fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV1 fmla v1.2d, v17.2d, alphaV0
fmla v2.2d, v18.2d, alphaV2 fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV3 fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64 add pCRow0, pCRow0, #64
@ -738,14 +757,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0] ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1 fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0] st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
@ -765,10 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0] ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0] st1 {v8.2d}, [pCRow0]
@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro SAVE1x1 .macro SAVE1x1
fmov alpha0, alpha
ldr d8, [pCRow0] ldr d8, [pCRow0]
fmadd d8, d16, alpha0, d8 fmadd d8, d16, alpha0, d8
str d8, [pCRow0] str d8, [pCRow0]
@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)] stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)] str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0 fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8 lsl LDC, LDC, #3 // ldc = ldc * 8
@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN:
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN:
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN ble dgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20: dgemm_kernel_L4_M8_20:
mov pB, origPB mov pB, origPB
@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20:
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a ble dgemm_kernel_L4_M8_22a
.align 5
.align 5
dgemm_kernel_L4_M8_22: dgemm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22:
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22 bgt dgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a: dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a:
b dgemm_kernel_L4_M8_44 b dgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32: dgemm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44:
ands counterL , origK, #7 ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100 ble dgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46: dgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46:
bne dgemm_kernel_L4_M8_46 bne dgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100: dgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4 SAVE8x4

View File

@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v29.2d, v1.2d, v9.2d[1] fmul v29.2d, v1.2d, v9.d[1]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
fmul v25.2d, v1.2d, v9.2d[0] fmul v25.2d, v1.2d, v9.d[0]
fmul v24.2d, v0.2d, v9.2d[0] fmul v24.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v8.2d[1] fmul v21.2d, v1.2d, v8.d[1]
fmul v28.2d, v0.2d, v9.2d[1] fmul v28.2d, v0.2d, v9.d[1]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

284
kernel/arm64/dtrmm_kernel_4x8.S Executable file → Normal file
View File

@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v0.2d, v8.2d[1] fmul v18.2d, v0.2d, v8.d[1]
fmul v19.2d, v1.2d, v8.2d[1] fmul v19.2d, v1.2d, v8.d[1]
fmul v20.2d, v0.2d, v9.2d[0] fmul v20.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v9.2d[0] fmul v21.2d, v1.2d, v9.d[0]
fmul v22.2d, v0.2d, v9.2d[1] fmul v22.2d, v0.2d, v9.d[1]
fmul v23.2d, v1.2d, v9.2d[1] fmul v23.2d, v1.2d, v9.d[1]
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
fmul v25.2d, v1.2d, v10.2d[0] fmul v25.2d, v1.2d, v10.d[0]
fmul v26.2d, v0.2d, v10.2d[1] fmul v26.2d, v0.2d, v10.d[1]
fmul v27.2d, v1.2d, v10.2d[1] fmul v27.2d, v1.2d, v10.d[1]
fmul v28.2d, v0.2d, v11.2d[0] fmul v28.2d, v0.2d, v11.d[0]
fmul v29.2d, v1.2d, v11.2d[0] fmul v29.2d, v1.2d, v11.d[0]
fmul v30.2d, v0.2d, v11.2d[1] fmul v30.2d, v0.2d, v11.d[1]
fmul v31.2d, v1.2d, v11.2d[1] fmul v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M1 .macro KERNEL4x8_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v19.2d, v1.2d, v8.2d[1] fmla v19.2d, v1.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v23.2d, v1.2d, v9.2d[1] fmla v23.2d, v1.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v27.2d, v1.2d, v10.2d[1] fmla v27.2d, v1.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
fmla v31.2d, v1.2d, v11.2d[1] fmla v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M2 .macro KERNEL4x8_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v4.2d, v12.2d[1] fmla v18.2d, v4.2d, v12.d[1]
fmla v19.2d, v5.2d, v12.2d[1] fmla v19.2d, v5.2d, v12.d[1]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v22.2d, v4.2d, v13.2d[1] fmla v22.2d, v4.2d, v13.d[1]
fmla v23.2d, v5.2d, v13.2d[1] fmla v23.2d, v5.2d, v13.d[1]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v26.2d, v4.2d, v14.2d[1] fmla v26.2d, v4.2d, v14.d[1]
fmla v27.2d, v5.2d, v14.2d[1] fmla v27.2d, v5.2d, v14.d[1]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v30.2d, v4.2d, v15.2d[1] fmla v30.2d, v4.2d, v15.d[1]
fmla v31.2d, v5.2d, v15.2d[1] fmla v31.2d, v5.2d, v15.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_E .macro KERNEL4x8_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v4.2d, v12.2d[1] fmla v18.2d, v4.2d, v12.d[1]
fmla v19.2d, v5.2d, v12.2d[1] fmla v19.2d, v5.2d, v12.d[1]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v22.2d, v4.2d, v13.2d[1] fmla v22.2d, v4.2d, v13.d[1]
fmla v23.2d, v5.2d, v13.2d[1] fmla v23.2d, v5.2d, v13.d[1]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v26.2d, v4.2d, v14.2d[1] fmla v26.2d, v4.2d, v14.d[1]
fmla v27.2d, v5.2d, v14.2d[1] fmla v27.2d, v5.2d, v14.d[1]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v30.2d, v4.2d, v15.2d[1] fmla v30.2d, v4.2d, v15.d[1]
fmla v31.2d, v5.2d, v15.2d[1] fmla v31.2d, v5.2d, v15.d[1]
.endm .endm
.macro KERNEL4x8_SUB .macro KERNEL4x8_SUB
@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v19.2d, v1.2d, v8.2d[1] fmla v19.2d, v1.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v23.2d, v1.2d, v9.2d[1] fmla v23.2d, v1.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v27.2d, v1.2d, v10.2d[1] fmla v27.2d, v1.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
fmla v31.2d, v1.2d, v11.2d[1] fmla v31.2d, v1.2d, v11.d[1]
.endm .endm
.macro SAVE4x8 .macro SAVE4x8
@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
.endm .endm
.macro SAVE2x8 .macro SAVE2x8
@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v29.2d, v1.2d, v9.2d[1] fmul v29.2d, v1.2d, v9.d[1]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
fmul v25.2d, v1.2d, v9.2d[0] fmul v25.2d, v1.2d, v9.d[0]
fmul v24.2d, v0.2d, v9.2d[0] fmul v24.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v8.2d[1] fmul v21.2d, v1.2d, v8.d[1]
fmul v28.2d, v0.2d, v9.2d[1] fmul v28.2d, v0.2d, v9.d[1]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

228
kernel/arm64/dtrmm_kernel_8x4.S Executable file → Normal file
View File

@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.d[0]
fmul v19.2d, v3.2d, v8.2d[0] fmul v19.2d, v3.2d, v8.d[0]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
fmul v21.2d, v1.2d, v8.2d[1] fmul v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.2d[1] fmul v22.2d, v2.2d, v8.d[1]
fmul v23.2d, v3.2d, v8.2d[1] fmul v23.2d, v3.2d, v8.d[1]
fmul v24.2d, v0.2d, v9.2d[0] fmul v24.2d, v0.2d, v9.d[0]
fmul v25.2d, v1.2d, v9.2d[0] fmul v25.2d, v1.2d, v9.d[0]
fmul v26.2d, v2.2d, v9.2d[0] fmul v26.2d, v2.2d, v9.d[0]
fmul v27.2d, v3.2d, v9.2d[0] fmul v27.2d, v3.2d, v9.d[0]
fmul v28.2d, v0.2d, v9.2d[1] fmul v28.2d, v0.2d, v9.d[1]
fmul v29.2d, v1.2d, v9.2d[1] fmul v29.2d, v1.2d, v9.d[1]
fmul v30.2d, v2.2d, v9.2d[1] fmul v30.2d, v2.2d, v9.d[1]
fmul v31.2d, v3.2d, v9.2d[1] fmul v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA] ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.2d[1] fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.2d[1] fmla v23.2d, v3.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.2d[0] fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.2d[0] fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.2d[1] fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.2d[1] fmla v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA] ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v6.2d, v12.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v19.2d, v7.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.2d[1] fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.2d[1] fmla v23.2d, v7.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.2d[0] fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.2d[0] fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.2d[1] fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.2d[1] fmla v31.2d, v7.2d, v13.d[1]
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v6.2d, v12.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v19.2d, v7.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.2d[1] fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.2d[1] fmla v23.2d, v7.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.2d[0] fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.2d[0] fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.2d[1] fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.2d[1] fmla v31.2d, v7.2d, v13.d[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.2d[1] fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.2d[1] fmla v23.2d, v3.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.2d[0] fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.2d[0] fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.2d[1] fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.2d[1] fmla v31.2d, v3.2d, v9.d[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.2d[1] fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.2d[1] fmla v23.2d, v3.2d, v8.d[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v18.4s, v2.4s, v8.2s[0] fmul v18.4s, v2.4s, v8.s[0]
fmul v19.4s, v3.4s, v8.2s[0] fmul v19.4s, v3.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v22.4s, v2.4s, v8.2s[1] fmul v22.4s, v2.4s, v8.s[1]
fmul v23.4s, v3.4s, v8.2s[1] fmul v23.4s, v3.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v26.4s, v2.4s, v9.2s[0] fmul v26.4s, v2.4s, v9.s[0]
fmul v27.4s, v3.4s, v9.2s[0] fmul v27.4s, v3.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
fmul v30.4s, v2.4s, v9.2s[1] fmul v30.4s, v2.4s, v9.s[1]
fmul v31.4s, v3.4s, v9.2s[1] fmul v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_M1 .macro KERNEL16x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v26.4s, v2.4s, v9.2s[0] fmla v26.4s, v2.4s, v9.s[0]
fmla v27.4s, v3.4s, v9.2s[0] fmla v27.4s, v3.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
fmla v30.4s, v2.4s, v9.2s[1] fmla v30.4s, v2.4s, v9.s[1]
fmla v31.4s, v3.4s, v9.2s[1] fmla v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_M2 .macro KERNEL16x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v18.4s, v6.4s, v12.2s[0] fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.2s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v22.4s, v6.4s, v12.2s[1] fmla v22.4s, v6.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.2s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v26.4s, v6.4s, v13.2s[0] fmla v26.4s, v6.4s, v13.s[0]
fmla v27.4s, v7.4s, v13.2s[0] fmla v27.4s, v7.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
fmla v30.4s, v6.4s, v13.2s[1] fmla v30.4s, v6.4s, v13.s[1]
fmla v31.4s, v7.4s, v13.2s[1] fmla v31.4s, v7.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_E .macro KERNEL16x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v18.4s, v6.4s, v12.2s[0] fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.2s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v22.4s, v6.4s, v12.2s[1] fmla v22.4s, v6.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.2s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v26.4s, v6.4s, v13.2s[0] fmla v26.4s, v6.4s, v13.s[0]
fmla v27.4s, v7.4s, v13.2s[0] fmla v27.4s, v7.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
fmla v30.4s, v6.4s, v13.2s[1] fmla v30.4s, v6.4s, v13.s[1]
fmla v31.4s, v7.4s, v13.2s[1] fmla v31.4s, v7.4s, v13.s[1]
.endm .endm
.macro KERNEL16x4_SUB .macro KERNEL16x4_SUB
@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v26.4s, v2.4s, v9.2s[0] fmla v26.4s, v2.4s, v9.s[0]
fmla v27.4s, v3.4s, v9.2s[0] fmla v27.4s, v3.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
fmla v30.4s, v2.4s, v9.2s[1] fmla v30.4s, v2.4s, v9.s[1]
fmla v31.4s, v3.4s, v9.2s[1] fmla v31.4s, v3.4s, v9.s[1]
.endm .endm
.macro SAVE16x4 .macro SAVE16x4
@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
.endm .endm
.macro SAVE16x2 .macro SAVE16x2
@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
.endm .endm
.macro SAVE16x1 .macro SAVE16x1
@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA_0] ld1 {v0.4s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
ld1 {v2.4s}, [pA_1] ld1 {v2.4s}, [pA_1]
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
ld1 {v4.4s}, [pA_2] ld1 {v4.4s}, [pA_2]
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmul v17.4s, v2.4s, v8.4s[0] fmul v17.4s, v2.4s, v8.s[0]
fmul v21.4s, v2.4s, v8.4s[1] fmul v21.4s, v2.4s, v8.s[1]
ld1 {v6.4s}, [pA_3] ld1 {v6.4s}, [pA_3]
add pA_3, pA_3, #16 add pA_3, pA_3, #16
fmul v25.4s, v2.4s, v8.4s[2] fmul v25.4s, v2.4s, v8.s[2]
fmul v29.4s, v2.4s, v8.4s[3] fmul v29.4s, v2.4s, v8.s[3]
ld1 {v12.4s}, [pB] // for next round ld1 {v12.4s}, [pB] // for next round
add pB, pB, #16 add pB, pB, #16
fmul v18.4s, v4.4s, v8.4s[0] fmul v18.4s, v4.4s, v8.s[0]
fmul v19.4s, v6.4s, v8.4s[0] fmul v19.4s, v6.4s, v8.s[0]
ld1 {v1.4s}, [pA_0] // for next round ld1 {v1.4s}, [pA_0] // for next round
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmul v22.4s, v4.4s, v8.4s[1] fmul v22.4s, v4.4s, v8.s[1]
fmul v23.4s, v6.4s, v8.4s[1] fmul v23.4s, v6.4s, v8.s[1]
ld1 {v3.4s}, [pA_1] // for next round ld1 {v3.4s}, [pA_1] // for next round
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmul v26.4s, v4.4s, v8.4s[2] fmul v26.4s, v4.4s, v8.s[2]
fmul v27.4s, v6.4s, v8.4s[2] fmul v27.4s, v6.4s, v8.s[2]
ld1 {v5.4s}, [pA_2] // for next round ld1 {v5.4s}, [pA_2] // for next round
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmul v30.4s, v4.4s, v8.4s[3] fmul v30.4s, v4.4s, v8.s[3]
fmul v31.4s, v6.4s, v8.4s[3] fmul v31.4s, v6.4s, v8.s[3]
ld1 {v7.4s}, [pA_3] // for next round ld1 {v7.4s}, [pA_3] // for next round
add pA_3, pA_3, #16 add pA_3, pA_3, #16
.endm .endm
.macro KERNEL16x4_M2 .macro KERNEL16x4_M2
fmla v16.4s, v1.4s, v12.4s[0] fmla v16.4s, v1.4s, v12.s[0]
fmla v17.4s, v3.4s, v12.4s[0] fmla v17.4s, v3.4s, v12.s[0]
ld1 {v8.4s}, [pB] // for next round ld1 {v8.4s}, [pB] // for next round
add pB, pB, #16 add pB, pB, #16
fmla v18.4s, v5.4s, v12.4s[0] fmla v18.4s, v5.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.4s[0] fmla v19.4s, v7.4s, v12.s[0]
ld1 {v0.4s}, [pA_0] // for next round ld1 {v0.4s}, [pA_0] // for next round
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v20.4s, v1.4s, v12.4s[1] fmla v20.4s, v1.4s, v12.s[1]
fmla v21.4s, v3.4s, v12.4s[1] fmla v21.4s, v3.4s, v12.s[1]
ld1 {v2.4s}, [pA_1] // for next round ld1 {v2.4s}, [pA_1] // for next round
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmla v22.4s, v5.4s, v12.4s[1] fmla v22.4s, v5.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.4s[1] fmla v23.4s, v7.4s, v12.s[1]
ld1 {v4.4s}, [pA_2] // for next round ld1 {v4.4s}, [pA_2] // for next round
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmla v24.4s, v1.4s, v12.4s[2] fmla v24.4s, v1.4s, v12.s[2]
fmla v25.4s, v3.4s, v12.4s[2] fmla v25.4s, v3.4s, v12.s[2]
ld1 {v6.4s}, [pA_3] // for next round ld1 {v6.4s}, [pA_3] // for next round
add pA_3, pA_3, #16 add pA_3, pA_3, #16
fmla v26.4s, v5.4s, v12.4s[2] fmla v26.4s, v5.4s, v12.s[2]
fmla v27.4s, v7.4s, v12.4s[2] fmla v27.4s, v7.4s, v12.s[2]
prfm PLDL1KEEP, [pA_2, #512] prfm PLDL1KEEP, [pA_2, #512]
fmla v28.4s, v1.4s, v12.4s[3] fmla v28.4s, v1.4s, v12.s[3]
fmla v29.4s, v3.4s, v12.4s[3] fmla v29.4s, v3.4s, v12.s[3]
prfm PLDL1KEEP, [pA_3, #512] prfm PLDL1KEEP, [pA_3, #512]
fmla v30.4s, v5.4s, v12.4s[3] fmla v30.4s, v5.4s, v12.s[3]
fmla v31.4s, v7.4s, v12.4s[3] fmla v31.4s, v7.4s, v12.s[3]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
.endm .endm
.macro KERNEL16x4_M1 .macro KERNEL16x4_M1
fmla v16.4s, v0.4s, v8.4s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v2.4s, v8.4s[0] fmla v17.4s, v2.4s, v8.s[0]
ld1 {v12.4s}, [pB] // for next round ld1 {v12.4s}, [pB] // for next round
add pB, pB, #16 add pB, pB, #16
fmla v18.4s, v4.4s, v8.4s[0] fmla v18.4s, v4.4s, v8.s[0]
fmla v19.4s, v6.4s, v8.4s[0] fmla v19.4s, v6.4s, v8.s[0]
ld1 {v1.4s}, [pA_0] // for next round ld1 {v1.4s}, [pA_0] // for next round
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v20.4s, v0.4s, v8.4s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v2.4s, v8.4s[1] fmla v21.4s, v2.4s, v8.s[1]
ld1 {v3.4s}, [pA_1] // for next round ld1 {v3.4s}, [pA_1] // for next round
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmla v22.4s, v4.4s, v8.4s[1] fmla v22.4s, v4.4s, v8.s[1]
fmla v23.4s, v6.4s, v8.4s[1] fmla v23.4s, v6.4s, v8.s[1]
ld1 {v5.4s}, [pA_2] // for next round ld1 {v5.4s}, [pA_2] // for next round
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmla v24.4s, v0.4s, v8.4s[2] fmla v24.4s, v0.4s, v8.s[2]
fmla v25.4s, v2.4s, v8.4s[2] fmla v25.4s, v2.4s, v8.s[2]
ld1 {v7.4s}, [pA_3] // for next round ld1 {v7.4s}, [pA_3] // for next round
add pA_3, pA_3, #16 add pA_3, pA_3, #16
fmla v26.4s, v4.4s, v8.4s[2] fmla v26.4s, v4.4s, v8.s[2]
fmla v27.4s, v6.4s, v8.4s[2] fmla v27.4s, v6.4s, v8.s[2]
prfm PLDL1KEEP, [pA_0, #512] prfm PLDL1KEEP, [pA_0, #512]
fmla v28.4s, v0.4s, v8.4s[3] fmla v28.4s, v0.4s, v8.s[3]
fmla v29.4s, v2.4s, v8.4s[3] fmla v29.4s, v2.4s, v8.s[3]
prfm PLDL1KEEP, [pA_1, #512] prfm PLDL1KEEP, [pA_1, #512]
fmla v30.4s, v4.4s, v8.4s[3] fmla v30.4s, v4.4s, v8.s[3]
fmla v31.4s, v6.4s, v8.4s[3] fmla v31.4s, v6.4s, v8.s[3]
.endm .endm
.macro KERNEL16x4_E .macro KERNEL16x4_E
fmla v16.4s, v1.4s, v12.4s[0] fmla v16.4s, v1.4s, v12.s[0]
fmla v17.4s, v3.4s, v12.4s[0] fmla v17.4s, v3.4s, v12.s[0]
fmla v18.4s, v5.4s, v12.4s[0] fmla v18.4s, v5.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.4s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v1.4s, v12.4s[1] fmla v20.4s, v1.4s, v12.s[1]
fmla v21.4s, v3.4s, v12.4s[1] fmla v21.4s, v3.4s, v12.s[1]
fmla v22.4s, v5.4s, v12.4s[1] fmla v22.4s, v5.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.4s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v1.4s, v12.4s[2] fmla v24.4s, v1.4s, v12.s[2]
fmla v25.4s, v3.4s, v12.4s[2] fmla v25.4s, v3.4s, v12.s[2]
fmla v26.4s, v5.4s, v12.4s[2] fmla v26.4s, v5.4s, v12.s[2]
fmla v27.4s, v7.4s, v12.4s[2] fmla v27.4s, v7.4s, v12.s[2]
fmla v28.4s, v1.4s, v12.4s[3] fmla v28.4s, v1.4s, v12.s[3]
fmla v29.4s, v3.4s, v12.4s[3] fmla v29.4s, v3.4s, v12.s[3]
fmla v30.4s, v5.4s, v12.4s[3] fmla v30.4s, v5.4s, v12.s[3]
fmla v31.4s, v7.4s, v12.4s[3] fmla v31.4s, v7.4s, v12.s[3]
.endm .endm
.macro KERNEL16x4_SUB .macro KERNEL16x4_SUB
@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA_0] ld1 {v0.4s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v16.4s, v0.4s, v8.4s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.4s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v24.4s, v0.4s, v8.4s[2] fmla v24.4s, v0.4s, v8.s[2]
fmla v28.4s, v0.4s, v8.4s[3] fmla v28.4s, v0.4s, v8.s[3]
ld1 {v2.4s}, [pA_1] ld1 {v2.4s}, [pA_1]
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmla v17.4s, v2.4s, v8.4s[0] fmla v17.4s, v2.4s, v8.s[0]
fmla v21.4s, v2.4s, v8.4s[1] fmla v21.4s, v2.4s, v8.s[1]
fmla v25.4s, v2.4s, v8.4s[2] fmla v25.4s, v2.4s, v8.s[2]
fmla v29.4s, v2.4s, v8.4s[3] fmla v29.4s, v2.4s, v8.s[3]
ld1 {v4.4s}, [pA_2] ld1 {v4.4s}, [pA_2]
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmla v18.4s, v4.4s, v8.4s[0] fmla v18.4s, v4.4s, v8.s[0]
fmla v22.4s, v4.4s, v8.4s[1] fmla v22.4s, v4.4s, v8.s[1]
fmla v26.4s, v4.4s, v8.4s[2] fmla v26.4s, v4.4s, v8.s[2]
fmla v30.4s, v4.4s, v8.4s[3] fmla v30.4s, v4.4s, v8.s[3]
ld1 {v6.4s}, [pA_3] ld1 {v6.4s}, [pA_3]
add pA_3, pA_3, #16 add pA_3, pA_3, #16
fmla v19.4s, v6.4s, v8.4s[0] fmla v19.4s, v6.4s, v8.s[0]
fmla v23.4s, v6.4s, v8.4s[1] fmla v23.4s, v6.4s, v8.s[1]
fmla v27.4s, v6.4s, v8.4s[2] fmla v27.4s, v6.4s, v8.s[2]
fmla v31.4s, v6.4s, v8.4s[3] fmla v31.4s, v6.4s, v8.s[3]
.endm .endm
.macro SAVE16x4 .macro SAVE16x4
@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0] ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v2.2s, v3.2s}, [pA_1] ld1 {v2.2s, v3.2s}, [pA_1]
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v18.2s, v2.2s, v8.2s[0] fmla v18.2s, v2.2s, v8.s[0]
fmla v31.2s, v3.2s, v9.2s[1] fmla v31.2s, v3.2s, v9.s[1]
fmla v22.2s, v2.2s, v8.2s[1] fmla v22.2s, v2.2s, v8.s[1]
fmla v27.2s, v3.2s, v9.2s[0] fmla v27.2s, v3.2s, v9.s[0]
fmla v26.2s, v2.2s, v9.2s[0] fmla v26.2s, v2.2s, v9.s[0]
fmla v23.2s, v3.2s, v8.2s[1] fmla v23.2s, v3.2s, v8.s[1]
fmla v30.2s, v2.2s, v9.2s[1] fmla v30.2s, v2.2s, v9.s[1]
fmla v19.2s, v3.2s, v8.2s[0] fmla v19.2s, v3.2s, v8.s[0]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0] ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0] ld1 {v0.2s}, [pA_0]
add pA_0, pA_0, #8 add pA_0, pA_0, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0] ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0] ld1 {v0.2s}, [pA_0]
add pA_0, pA_0, #8 add pA_0, pA_0, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA_0] ldr s0 , [pA_0]
add pA_0, pA_0, #4 add pA_0, pA_0, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0] ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0 , pA_0, #16 add pA_0 , pA_0, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0] ld1 {v0.2s}, [pA_0]
add pA_0 , pA_0, #8 add pA_0 , pA_0, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v4.4s[0] fmul v16.4s, v0.4s, v4.s[0]
fmul v17.4s, v1.4s, v4.4s[0] fmul v17.4s, v1.4s, v4.s[0]
fmul v18.4s, v0.4s, v4.4s[1] fmul v18.4s, v0.4s, v4.s[1]
fmul v19.4s, v1.4s, v4.4s[1] fmul v19.4s, v1.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.4s[2] fmul v20.4s, v0.4s, v4.s[2]
fmul v21.4s, v1.4s, v4.4s[2] fmul v21.4s, v1.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.4s[3] fmul v22.4s, v0.4s, v4.s[3]
fmul v23.4s, v1.4s, v4.4s[3] fmul v23.4s, v1.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.4s[0] fmul v24.4s, v0.4s, v5.s[0]
fmul v25.4s, v1.4s, v5.4s[0] fmul v25.4s, v1.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.4s[1] fmul v26.4s, v0.4s, v5.s[1]
fmul v27.4s, v1.4s, v5.4s[1] fmul v27.4s, v1.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.4s[2] fmul v28.4s, v0.4s, v5.s[2]
fmul v29.4s, v1.4s, v5.4s[2] fmul v29.4s, v1.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.4s[3] fmul v30.4s, v0.4s, v5.s[3]
fmul v31.4s, v1.4s, v5.4s[3] fmul v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_M1 .macro KERNEL8x8_M1
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.4s[0] fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.4s[1] fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.4s[2] fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.4s[3] fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.4s[0] fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.4s[1] fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.4s[2] fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.4s[3] fmla v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_M2 .macro KERNEL8x8_M2
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.4s[0] fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.4s[1] fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.4s[2] fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.4s[3] fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.4s[0] fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.4s[1] fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.4s[2] fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.4s[3] fmla v31.4s, v3.4s, v7.s[3]
ld1 {v4.4s}, [pB] ld1 {v4.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_E .macro KERNEL8x8_E
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.4s[0] fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.4s[1] fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.4s[2] fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.4s[3] fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.4s[0] fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.4s[1] fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.4s[2] fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.4s[3] fmla v31.4s, v3.4s, v7.s[3]
.endm .endm
.macro KERNEL8x8_SUB .macro KERNEL8x8_SUB
@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.4s[0] fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.4s[1] fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.4s[2] fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.4s[3] fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.4s[0] fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.4s[1] fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.4s[2] fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.4s[3] fmla v31.4s, v1.4s, v5.s[3]
.endm .endm
.macro SAVE8x8 .macro SAVE8x8
@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA] ld1 {v0.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v4.4s[0] fmul v16.4s, v0.4s, v4.s[0]
fmul v18.4s, v0.4s, v4.4s[1] fmul v18.4s, v0.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.4s[2] fmul v20.4s, v0.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.4s[3] fmul v22.4s, v0.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.4s[0] fmul v24.4s, v0.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.4s[1] fmul v26.4s, v0.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.4s[2] fmul v28.4s, v0.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.4s[3] fmul v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M1 .macro KERNEL4x8_M1
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M2 .macro KERNEL4x8_M2
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
ld1 {v4.4s}, [pB] ld1 {v4.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL4x8_E: eight by-element FMLAs into the even accumulators v16..v30 (4 floats each).
//   Vector operand is v2 throughout; the scalar operand walks lanes 0..3 of v6, then of v7.
//   These are the same eight FMLAs that open KERNEL4x8_M2, but no loads or pointer
//   updates follow them here.
//   NOTE(review): presumably the last step of the M1/M2 pipeline - confirm at the call sites.
// Diff view: each line shows the old lane spelling (vN.4s[i]) followed by the new one (vN.s[i]).
.macro KERNEL4x8_E .macro KERNEL4x8_E
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
.endm .endm
.macro KERNEL4x8_SUB .macro KERNEL4x8_SUB
@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA] ld1 {v0.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
.endm .endm
.macro SAVE4x8 .macro SAVE4x8
@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v4.4s[0] fmla v16.2s, v0.2s, v4.s[0]
fmla v18.2s, v0.2s, v4.4s[1] fmla v18.2s, v0.2s, v4.s[1]
fmla v20.2s, v0.2s, v4.4s[2] fmla v20.2s, v0.2s, v4.s[2]
fmla v22.2s, v0.2s, v4.4s[3] fmla v22.2s, v0.2s, v4.s[3]
fmla v24.2s, v0.2s, v5.4s[0] fmla v24.2s, v0.2s, v5.s[0]
fmla v26.2s, v0.2s, v5.4s[1] fmla v26.2s, v0.2s, v5.s[1]
fmla v28.2s, v0.2s, v5.4s[2] fmla v28.2s, v0.2s, v5.s[2]
fmla v30.2s, v0.2s, v5.4s[3] fmla v30.2s, v0.2s, v5.s[3]
.endm .endm
.macro SAVE2x8 .macro SAVE2x8
@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0, [pA] ldr s0, [pA]
add pA, pA, #4 add pA, pA, #4
fmla s16, s0, v4.4s[0] fmla s16, s0, v4.s[0]
fmla s18, s0, v4.4s[1] fmla s18, s0, v4.s[1]
fmla s20, s0, v4.4s[2] fmla s20, s0, v4.s[2]
fmla s22, s0, v4.4s[3] fmla s22, s0, v4.s[3]
fmla s24, s0, v5.4s[0] fmla s24, s0, v5.s[0]
fmla s26, s0, v5.4s[1] fmla s26, s0, v5.s[1]
fmla s28, s0, v5.4s[2] fmla s28, s0, v5.s[2]
fmla s30, s0, v5.4s[3] fmla s30, s0, v5.s[3]
.endm .endm
.macro SAVE1x8 .macro SAVE1x8
@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL8x4_E: eight by-element FMLAs (4 floats each) into v16/v17, v20/v21, v24/v25, v28/v29.
//   Each accumulator pair uses v4 then v5 as the vector operand; the scalar operand is
//   v12[0], v12[1], v13[0], v13[1] for the four pairs respectively.
//   No loads and no pointer updates are issued here.
//   NOTE(review): presumably the last step of the M1/M2 pipeline - confirm at the call sites.
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL4x4_M1: multiply-accumulate with the v0/v1 + v8/v9 register set while loading
// the v4/v5 + v12/v13 set that KERNEL4x4_M2 and KERNEL4x4_E read.
//   Accumulators (2 floats each):
//     v16 += v0*v8[0]   v17 += v1*v8[0]
//     v20 += v0*v8[1]   v21 += v1*v8[1]
//     v24 += v0*v9[0]   v25 += v1*v9[0]
//     v28 += v0*v9[1]   v29 += v1*v9[1]
//   The FMLAs are issued in a mixed order with the loads and the prefetch placed between them.
//   pA and pB are each advanced by 16 bytes.
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
// Prefetch 512 bytes past the already-advanced pB.
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
// KERNEL4x4_M2: counterpart of KERNEL4x4_M1 with the register sets swapped. It
// multiply-accumulates with v4/v5 + v12/v13 while reloading v0/v1 + v8/v9.
//   Accumulators (2 floats each):
//     v16 += v4*v12[0]   v17 += v5*v12[0]
//     v20 += v4*v12[1]   v21 += v5*v12[1]
//     v24 += v4*v13[0]   v25 += v5*v13[0]
//     v28 += v4*v13[1]   v29 += v5*v13[1]
//   pA and pB are each advanced by 16 bytes; the prefetch here targets pA (M1 prefetches pB).
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
// Prefetch 512 bytes past the already-advanced pA.
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
// KERNEL4x4_E: the eight FMLAs of KERNEL4x4_M2 (v4/v5 times lanes of v12/v13, in the
// same issue order) without that macro's loads, pointer updates or prefetch.
//   NOTE(review): presumably the last step of the M1/M2 pipeline - confirm at the call sites.
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

384
kernel/arm64/strmm_kernel_16x4.S Executable file → Normal file
View File

@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v18.4s, v2.4s, v8.2s[0] fmul v18.4s, v2.4s, v8.s[0]
fmul v19.4s, v3.4s, v8.2s[0] fmul v19.4s, v3.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v22.4s, v2.4s, v8.2s[1] fmul v22.4s, v2.4s, v8.s[1]
fmul v23.4s, v3.4s, v8.2s[1] fmul v23.4s, v3.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v26.4s, v2.4s, v9.2s[0] fmul v26.4s, v2.4s, v9.s[0]
fmul v27.4s, v3.4s, v9.2s[0] fmul v27.4s, v3.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
fmul v30.4s, v2.4s, v9.2s[1] fmul v30.4s, v2.4s, v9.s[1]
fmul v31.4s, v3.4s, v9.2s[1] fmul v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_M1 .macro KERNEL16x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v26.4s, v2.4s, v9.2s[0] fmla v26.4s, v2.4s, v9.s[0]
fmla v27.4s, v3.4s, v9.2s[0] fmla v27.4s, v3.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
fmla v30.4s, v2.4s, v9.2s[1] fmla v30.4s, v2.4s, v9.s[1]
fmla v31.4s, v3.4s, v9.2s[1] fmla v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_M2 .macro KERNEL16x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v18.4s, v6.4s, v12.2s[0] fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.2s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v22.4s, v6.4s, v12.2s[1] fmla v22.4s, v6.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.2s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v26.4s, v6.4s, v13.2s[0] fmla v26.4s, v6.4s, v13.s[0]
fmla v27.4s, v7.4s, v13.2s[0] fmla v27.4s, v7.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
fmla v30.4s, v6.4s, v13.2s[1] fmla v30.4s, v6.4s, v13.s[1]
fmla v31.4s, v7.4s, v13.2s[1] fmla v31.4s, v7.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL16x4_E: sixteen by-element FMLAs that accumulate into v16-v31 (4 floats each).
//   Each group of four accumulators uses v4, v5, v6, v7 as the vector operands, in that order.
//   The scalar operand per group is v12[0], v12[1], v13[0], v13[1].
//   No loads and no pointer updates are issued here.
//   NOTE(review): presumably the last step of the M1/M2 pipeline - confirm at the call sites.
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL16x4_E .macro KERNEL16x4_E
// v16..v19 += {v4..v7} * v12[0]
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v18.4s, v6.4s, v12.2s[0] fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.2s[0] fmla v19.4s, v7.4s, v12.s[0]
// v20..v23 += {v4..v7} * v12[1]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v22.4s, v6.4s, v12.2s[1] fmla v22.4s, v6.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.2s[1] fmla v23.4s, v7.4s, v12.s[1]
// v24..v27 += {v4..v7} * v13[0]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v26.4s, v6.4s, v13.2s[0] fmla v26.4s, v6.4s, v13.s[0]
fmla v27.4s, v7.4s, v13.2s[0] fmla v27.4s, v7.4s, v13.s[0]
// v28..v31 += {v4..v7} * v13[1]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
fmla v30.4s, v6.4s, v13.2s[1] fmla v30.4s, v6.4s, v13.s[1]
fmla v31.4s, v7.4s, v13.2s[1] fmla v31.4s, v7.4s, v13.s[1]
.endm .endm
.macro KERNEL16x4_SUB .macro KERNEL16x4_SUB
@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v26.4s, v2.4s, v9.2s[0] fmla v26.4s, v2.4s, v9.s[0]
fmla v27.4s, v3.4s, v9.2s[0] fmla v27.4s, v3.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
fmla v30.4s, v2.4s, v9.2s[1] fmla v30.4s, v2.4s, v9.s[1]
fmla v31.4s, v3.4s, v9.2s[1] fmla v31.4s, v3.4s, v9.s[1]
.endm .endm
.macro SAVE16x4 .macro SAVE16x4
@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL8x4_E: eight by-element FMLAs (4 floats each) into v16/v17, v20/v21, v24/v25, v28/v29.
//   Each accumulator pair uses v4 then v5 as the vector operand; the scalar operand is
//   v12[0], v12[1], v13[0], v13[1] for the four pairs respectively.
//   No loads and no pointer updates are issued here.
//   NOTE(review): presumably the last step of the M1/M2 pipeline - confirm at the call sites.
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL4x4_M1: multiply-accumulate with the v0/v1 + v8/v9 register set while loading
// the v4/v5 + v12/v13 set that KERNEL4x4_M2 and KERNEL4x4_E read.
//   Accumulators (2 floats each):
//     v16 += v0*v8[0]   v17 += v1*v8[0]
//     v20 += v0*v8[1]   v21 += v1*v8[1]
//     v24 += v0*v9[0]   v25 += v1*v9[0]
//     v28 += v0*v9[1]   v29 += v1*v9[1]
//   The FMLAs are issued in a mixed order with the loads and the prefetch placed between them.
//   pA and pB are each advanced by 16 bytes.
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
// Prefetch 512 bytes past the already-advanced pB.
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
// KERNEL4x4_M2: counterpart of KERNEL4x4_M1 with the register sets swapped. It
// multiply-accumulates with v4/v5 + v12/v13 while reloading v0/v1 + v8/v9.
//   Accumulators (2 floats each):
//     v16 += v4*v12[0]   v17 += v5*v12[0]
//     v20 += v4*v12[1]   v21 += v5*v12[1]
//     v24 += v4*v13[0]   v25 += v5*v13[0]
//     v28 += v4*v13[1]   v29 += v5*v13[1]
//   pA and pB are each advanced by 16 bytes; the prefetch here targets pA (M1 prefetches pB).
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
// Prefetch 512 bytes past the already-advanced pA.
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
// KERNEL4x4_E: the eight FMLAs of KERNEL4x4_M2 (v4/v5 times lanes of v12/v13, in the
// same issue order) without that macro's loads, pointer updates or prefetch.
//   NOTE(review): presumably the last step of the M1/M2 pipeline - confirm at the call sites.
// Diff view: each line shows the old lane spelling (vN.2s[i]) followed by the new one (vN.s[i]).
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
.endm .endm
.macro SAVE16x2 .macro SAVE16x2
@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
.endm .endm
.macro SAVE16x1 .macro SAVE16x1
@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v8.2s[0] on the left, v8.s[0] on
// the right).
//
// Purpose: one multiply-accumulate step of the 4x4 single-precision kernel.
// v0/v1 (two lanes each) are multiplied by one lane of v8/v9 and added into
// eight two-lane accumulators:
//   v16/v17 += {v0,v1} * v8[0]      v20/v21 += {v0,v1} * v8[1]
//   v24/v25 += {v0,v1} * v9[0]      v28/v29 += {v0,v1} * v9[1]
// Interleaved with the arithmetic, v12/v13 are loaded from pB and v4/v5 from
// pA (each pointer advanced by 16 bytes); those are the registers that
// KERNEL4x4_M2 reads, so the two macros alternate register sets.
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
// Prefetch hint for the address 512 bytes past the (already advanced) pB.
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v12.2s[0] on the left, v12.s[0] on
// the right).
//
// Purpose: counterpart of KERNEL4x4_M1 working on the alternate register set.
// v4/v5 (two lanes each) are multiplied by one lane of v12/v13 and added into
// the same eight accumulators:
//   v16/v17 += {v4,v5} * v12[0]     v20/v21 += {v4,v5} * v12[1]
//   v24/v25 += {v4,v5} * v13[0]     v28/v29 += {v4,v5} * v13[1]
// Interleaved with the arithmetic, v8/v9 are reloaded from pB and v0/v1 from
// pA (each pointer advanced by 16 bytes), i.e. the operands KERNEL4x4_M1 reads.
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
// Prefetch hint for the address 512 bytes past the (already advanced) pA.
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v12.2s[0] on the left, v12.s[0] on
// the right).
//
// Purpose: performs the same eight multiply-accumulates as KERNEL4x4_M2
// (v4/v5 times lanes of v12/v13 into v16,v17,v20,v21,v24,v25,v28,v29) but
// issues no loads, pointer updates or prefetches.
// NOTE(review): presumably the closing step after the last M1/M2 pair -
// confirm against the loop that invokes these macros.
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

472
kernel/arm64/strmm_kernel_8x8.S Executable file → Normal file
View File

@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v4.4s[0] fmul v16.4s, v0.4s, v4.s[0]
fmul v17.4s, v1.4s, v4.4s[0] fmul v17.4s, v1.4s, v4.s[0]
fmul v18.4s, v0.4s, v4.4s[1] fmul v18.4s, v0.4s, v4.s[1]
fmul v19.4s, v1.4s, v4.4s[1] fmul v19.4s, v1.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.4s[2] fmul v20.4s, v0.4s, v4.s[2]
fmul v21.4s, v1.4s, v4.4s[2] fmul v21.4s, v1.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.4s[3] fmul v22.4s, v0.4s, v4.s[3]
fmul v23.4s, v1.4s, v4.4s[3] fmul v23.4s, v1.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.4s[0] fmul v24.4s, v0.4s, v5.s[0]
fmul v25.4s, v1.4s, v5.4s[0] fmul v25.4s, v1.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.4s[1] fmul v26.4s, v0.4s, v5.s[1]
fmul v27.4s, v1.4s, v5.4s[1] fmul v27.4s, v1.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.4s[2] fmul v28.4s, v0.4s, v5.s[2]
fmul v29.4s, v1.4s, v5.4s[2] fmul v29.4s, v1.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.4s[3] fmul v30.4s, v0.4s, v5.s[3]
fmul v31.4s, v1.4s, v5.4s[3] fmul v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_M1 .macro KERNEL8x8_M1
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.4s[0] fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.4s[1] fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.4s[2] fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.4s[3] fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.4s[0] fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.4s[1] fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.4s[2] fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.4s[3] fmla v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_M2 .macro KERNEL8x8_M2
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.4s[0] fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.4s[1] fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.4s[2] fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.4s[3] fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.4s[0] fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.4s[1] fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.4s[2] fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.4s[3] fmla v31.4s, v3.4s, v7.s[3]
ld1 {v4.4s}, [pB] ld1 {v4.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_E .macro KERNEL8x8_E
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v6.4s[0] on the left, v6.s[0] on
// the right).
//
// Purpose: sixteen multiply-accumulates into the four-lane accumulators
// v16..v31, with no loads, pointer updates or prefetches. For k = 0..3:
//   v(16+2k) += v2 * v6[k]      v(17+2k) += v3 * v6[k]
//   v(24+2k) += v2 * v7[k]      v(25+2k) += v3 * v7[k]
// NOTE(review): v2, v3, v6 and v7 must already hold their operands on entry;
// presumably they are loaded by the preceding KERNEL8x8_M1 step - confirm.
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.4s[0] fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.4s[1] fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.4s[2] fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.4s[3] fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.4s[0] fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.4s[1] fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.4s[2] fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.4s[3] fmla v31.4s, v3.4s, v7.s[3]
.endm .endm
.macro KERNEL8x8_SUB .macro KERNEL8x8_SUB
@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.4s[0] fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.4s[1] fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.4s[2] fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.4s[3] fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.4s[0] fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.4s[1] fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.4s[2] fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.4s[3] fmla v31.4s, v1.4s, v5.s[3]
.endm .endm
.macro SAVE8x8 .macro SAVE8x8
@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA] ld1 {v0.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v4.4s[0] fmul v16.4s, v0.4s, v4.s[0]
fmul v18.4s, v0.4s, v4.4s[1] fmul v18.4s, v0.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.4s[2] fmul v20.4s, v0.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.4s[3] fmul v22.4s, v0.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.4s[0] fmul v24.4s, v0.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.4s[1] fmul v26.4s, v0.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.4s[2] fmul v28.4s, v0.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.4s[3] fmul v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M1 .macro KERNEL4x8_M1
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M2 .macro KERNEL4x8_M2
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
ld1 {v4.4s}, [pB] ld1 {v4.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_E .macro KERNEL4x8_E
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v6.4s[0] on the left, v6.s[0] on
// the right).
//
// Purpose: eight multiply-accumulates into the even-numbered four-lane
// accumulators v16..v30, with no loads, pointer updates or prefetches.
// For k = 0..3:
//   v(16+2k) += v2 * v6[k]      v(24+2k) += v2 * v7[k]
// NOTE(review): v2, v6 and v7 must already hold their operands on entry;
// presumably they are loaded by the preceding KERNEL4x8_M1 step - confirm.
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
.endm .endm
.macro KERNEL4x8_SUB .macro KERNEL4x8_SUB
@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA] ld1 {v0.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
.endm .endm
.macro SAVE4x8 .macro SAVE4x8
@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v4.4s[0] fmla v16.2s, v0.2s, v4.s[0]
fmla v18.2s, v0.2s, v4.4s[1] fmla v18.2s, v0.2s, v4.s[1]
fmla v20.2s, v0.2s, v4.4s[2] fmla v20.2s, v0.2s, v4.s[2]
fmla v22.2s, v0.2s, v4.4s[3] fmla v22.2s, v0.2s, v4.s[3]
fmla v24.2s, v0.2s, v5.4s[0] fmla v24.2s, v0.2s, v5.s[0]
fmla v26.2s, v0.2s, v5.4s[1] fmla v26.2s, v0.2s, v5.s[1]
fmla v28.2s, v0.2s, v5.4s[2] fmla v28.2s, v0.2s, v5.s[2]
fmla v30.2s, v0.2s, v5.4s[3] fmla v30.2s, v0.2s, v5.s[3]
.endm .endm
.macro SAVE2x8 .macro SAVE2x8
@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0, [pA] ldr s0, [pA]
add pA, pA, #4 add pA, pA, #4
fmla s16, s0, v4.4s[0] fmla s16, s0, v4.s[0]
fmla s18, s0, v4.4s[1] fmla s18, s0, v4.s[1]
fmla s20, s0, v4.4s[2] fmla s20, s0, v4.s[2]
fmla s22, s0, v4.4s[3] fmla s22, s0, v4.s[3]
fmla s24, s0, v5.4s[0] fmla s24, s0, v5.s[0]
fmla s26, s0, v5.4s[1] fmla s26, s0, v5.s[1]
fmla s28, s0, v5.4s[2] fmla s28, s0, v5.s[2]
fmla s30, s0, v5.4s[3] fmla s30, s0, v5.s[3]
.endm .endm
.macro SAVE1x8 .macro SAVE1x8
@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v12.2s[0] on the left, v12.s[0] on
// the right).
//
// Purpose: eight multiply-accumulates into four-lane accumulators, with no
// loads, pointer updates or prefetches:
//   v16/v17 += {v4,v5} * v12[0]     v20/v21 += {v4,v5} * v12[1]
//   v24/v25 += {v4,v5} * v13[0]     v28/v29 += {v4,v5} * v13[1]
// NOTE(review): v4, v5, v12 and v13 must already hold their operands on
// entry; presumably they are loaded by the preceding KERNEL8x4_M1 step -
// confirm.
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v8.2s[0] on the left, v8.s[0] on
// the right).
//
// Purpose: one multiply-accumulate step of the 4x4 single-precision kernel.
// v0/v1 (two lanes each) are multiplied by one lane of v8/v9 and added into
// eight two-lane accumulators:
//   v16/v17 += {v0,v1} * v8[0]      v20/v21 += {v0,v1} * v8[1]
//   v24/v25 += {v0,v1} * v9[0]      v28/v29 += {v0,v1} * v9[1]
// Interleaved with the arithmetic, v12/v13 are loaded from pB and v4/v5 from
// pA (each pointer advanced by 16 bytes); those are the registers that
// KERNEL4x4_M2 reads, so the two macros alternate register sets.
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
// Prefetch hint for the address 512 bytes past the (already advanced) pB.
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v12.2s[0] on the left, v12.s[0] on
// the right).
//
// Purpose: counterpart of KERNEL4x4_M1 working on the alternate register set.
// v4/v5 (two lanes each) are multiplied by one lane of v12/v13 and added into
// the same eight accumulators:
//   v16/v17 += {v4,v5} * v12[0]     v20/v21 += {v4,v5} * v12[1]
//   v24/v25 += {v4,v5} * v13[0]     v28/v29 += {v4,v5} * v13[1]
// Interleaved with the arithmetic, v8/v9 are reloaded from pB and v0/v1 from
// pA (each pointer advanced by 16 bytes), i.e. the operands KERNEL4x4_M1 reads.
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
// Prefetch hint for the address 512 bytes past the (already advanced) pA.
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
// Listing note: every row holds the instruction twice - the left copy is the
// old text, the right copy the new text. The copies differ only in how the
// indexed element operand is spelled (e.g. v12.2s[0] on the left, v12.s[0] on
// the right).
//
// Purpose: performs the same eight multiply-accumulates as KERNEL4x4_M2
// (v4/v5 times lanes of v12/v13 into v16,v17,v20,v21,v24,v25,v28,v29) but
// issues no loads, pointer updates or prefetches.
// NOTE(review): presumably the closing step after the last M1/M2 pair -
// confirm against the loop that invokes these macros.
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.2d, v0.2d, v9.2d[0] fmls v17.2d, v0.2d, v9.d[0]
#else #else
fmul v17.2d, v0.2d, v9.2d[0] fmul v17.2d, v0.2d, v9.d[0]
#endif #endif
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.2d[0] fmls v19.2d, v2.2d, v9.d[0]
#else #else
fmul v19.2d, v2.2d, v9.2d[0] fmul v19.2d, v2.2d, v9.d[0]
#endif #endif
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.2d, v0.2d, v9.2d[1] fmls v21.2d, v0.2d, v9.d[1]
#else #else
fmul v21.2d, v0.2d, v9.2d[1] fmul v21.2d, v0.2d, v9.d[1]
#endif #endif
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.2d[1] fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.2d, v2.2d, v9.2d[1] fmls v23.2d, v2.2d, v9.d[1]
#else #else
fmul v23.2d, v2.2d, v9.2d[1] fmul v23.2d, v2.2d, v9.d[1]
#endif #endif
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.2d, v0.2d, v11.2d[0] fmls v25.2d, v0.2d, v11.d[0]
#else #else
fmul v25.2d, v0.2d, v11.2d[0] fmul v25.2d, v0.2d, v11.d[0]
#endif #endif
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
fmul v26.2d, v2.2d, v10.2d[0] fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.2d, v2.2d, v11.2d[0] fmls v27.2d, v2.2d, v11.d[0]
#else #else
fmul v27.2d, v2.2d, v11.2d[0] fmul v27.2d, v2.2d, v11.d[0]
#endif #endif
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
fmul v28.2d, v0.2d, v10.2d[1] fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.2d, v0.2d, v11.2d[1] fmls v29.2d, v0.2d, v11.d[1]
#else #else
fmul v29.2d, v0.2d, v11.2d[1] fmul v29.2d, v0.2d, v11.d[1]
#endif #endif
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
fmul v30.2d, v2.2d, v10.2d[1] fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.2d, v2.2d, v11.2d[1] fmls v31.2d, v2.2d, v11.d[1]
#else #else
fmul v31.2d, v2.2d, v11.2d[1] fmul v31.2d, v2.2d, v11.d[1]
#endif #endif
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB] ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL4x4_M1: one accumulation step of the 4x4 kernel that consumes the
// operand set v0-v3 (loaded from pA) and v8-v11 (loaded from pB) and, in
// between the arithmetic, loads the alternate set v4-v7 / v12-v15 for the
// following step.
//
// Accumulators: v16-v31. Each even/odd pair (v16/v17, v18/v19, ...) is
// updated by OP_rr + OP_ii (even register) and OP_ri + OP_ir (odd register).
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this view - presumably
// multiply-accumulate variants chosen by the conjugation build flags;
// confirm against their definitions.
//
// Side effects: pA and pB are each advanced by 64 bytes (two 32-byte ld2
// loads apiece); v4-v7 and v12-v15 are overwritten; two prefetches are issued.
//
// Note: every line of this listing carries the instruction twice - first with
// the old element spelling (vN.2d[i]), then with the new one (vN.d[i]).
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
// v16/v17: v0,v1 against lane 0 of v8/v9
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round ld2 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// v18/v19: v2,v3 against lane 0 of v8/v9
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// v20/v21: v0,v1 against lane 1 of v8/v9
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round ld2 {v4.2d, v5.2d} , [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// v22/v23: v2,v3 against lane 1 of v8/v9
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// v24/v25: v0,v1 against lane 0 of v10/v11
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
// Prefetch 512 bytes past the already-advanced pA
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
// v26/v27: v2,v3 against lane 0 of v10/v11
OP_rr v26.2d, v2.2d, v10.2d[0] OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.2d[0] OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
// Prefetch 512 bytes past the already-advanced pB
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// v28/v29: v0,v1 against lane 1 of v10/v11
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
// v30/v31: v2,v3 against lane 1 of v10/v11
OP_rr v30.2d, v2.2d, v10.2d[1] OP_rr v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
OP_ri v31.2d, v2.2d, v11.2d[1] OP_ri v31.2d, v2.2d, v11.d[1]
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
.endm .endm
// KERNEL4x4_M2: one accumulation step of the 4x4 kernel that consumes the
// operand set v4-v7 (loaded from pA) and v12-v15 (loaded from pB) and, in
// between the arithmetic, reloads the other set v0-v3 / v8-v11 for the
// following step.
//
// Accumulators: v16-v31, updated in even/odd pairs: OP_rr + OP_ii go to the
// even register, OP_ri + OP_ir to the odd one. The OP_* macros are defined
// outside this view - presumably multiply-accumulate variants chosen by the
// conjugation build flags; confirm against their definitions.
//
// Side effects: pA and pB are each advanced by 64 bytes (two 32-byte ld2
// loads apiece); v0-v3 and v8-v11 are overwritten; two prefetches are issued.
//
// Note: every line of this listing carries the instruction twice - first with
// the old element spelling (vN.2d[i]), then with the new one (vN.d[i]).
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
// v16/v17: v4,v5 against lane 0 of v12/v13
OP_rr v16.2d, v4.2d, v12.2d[0] OP_rr v16.2d, v4.2d, v12.d[0]
OP_ii v16.2d, v5.2d, v13.2d[0] OP_ii v16.2d, v5.2d, v13.d[0]
OP_ri v17.2d, v4.2d, v13.2d[0] OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.2d[0] OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round ld2 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// v18/v19: v6,v7 against lane 0 of v12/v13
OP_rr v18.2d, v6.2d, v12.2d[0] OP_rr v18.2d, v6.2d, v12.d[0]
OP_ii v18.2d, v7.2d, v13.2d[0] OP_ii v18.2d, v7.2d, v13.d[0]
OP_ri v19.2d, v6.2d, v13.2d[0] OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.2d[0] OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// v20/v21: v4,v5 against lane 1 of v12/v13
OP_rr v20.2d, v4.2d, v12.2d[1] OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.2d[1] OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.2d[1] OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.2d[1] OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round ld2 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// v22/v23: v6,v7 against lane 1 of v12/v13
OP_rr v22.2d, v6.2d, v12.2d[1] OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.2d[1] OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.2d[1] OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.2d[1] OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// v24/v25: v4,v5 against lane 0 of v14/v15
OP_rr v24.2d, v4.2d, v14.2d[0] OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.2d[0] OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.2d[0] OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.2d[0] OP_ir v25.2d, v5.2d, v14.d[0]
// Prefetch 512 bytes past the already-advanced pA
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
// v26/v27: v6,v7 against lane 0 of v14/v15
OP_rr v26.2d, v6.2d, v14.2d[0] OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.2d[0] OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.2d[0] OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.2d[0] OP_ir v27.2d, v7.2d, v14.d[0]
// Prefetch 512 bytes past the already-advanced pB
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// v28/v29: v4,v5 against lane 1 of v14/v15
OP_rr v28.2d, v4.2d, v14.2d[1] OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.2d[1] OP_ii v28.2d, v5.2d, v15.d[1]
OP_ri v29.2d, v4.2d, v15.2d[1] OP_ri v29.2d, v4.2d, v15.d[1]
OP_ir v29.2d, v5.2d, v14.2d[1] OP_ir v29.2d, v5.2d, v14.d[1]
// v30/v31: v6,v7 against lane 1 of v14/v15
OP_rr v30.2d, v6.2d, v14.2d[1] OP_rr v30.2d, v6.2d, v14.d[1]
OP_ii v30.2d, v7.2d, v15.2d[1] OP_ii v30.2d, v7.2d, v15.d[1]
OP_ri v31.2d, v6.2d, v15.2d[1] OP_ri v31.2d, v6.2d, v15.d[1]
OP_ir v31.2d, v7.2d, v14.2d[1] OP_ir v31.2d, v7.2d, v14.d[1]
.endm .endm
// KERNEL4x4_E: accumulation step of the 4x4 kernel over the operand set
// v4-v7 / v12-v15 with no loads, no pointer updates and no prefetches.
// It performs the same 32 OP_* updates of v16-v31 as KERNEL4x4_M2, so it is
// presumably the closing step of the M1/M2 pipeline, run when there is no
// further data to fetch - confirm against the call sites.
//
// The OP_rr/OP_ii/OP_ri/OP_ir macros are defined outside this view.
// Even accumulators receive OP_rr + OP_ii, odd accumulators OP_ri + OP_ir.
//
// Note: every line of this listing carries the instruction twice - first with
// the old element spelling (vN.2d[i]), then with the new one (vN.d[i]).
.macro KERNEL4x4_E .macro KERNEL4x4_E
// v16-v19: lane 0 of v12/v13
OP_rr v16.2d, v4.2d, v12.2d[0] OP_rr v16.2d, v4.2d, v12.d[0]
OP_ii v16.2d, v5.2d, v13.2d[0] OP_ii v16.2d, v5.2d, v13.d[0]
OP_ri v17.2d, v4.2d, v13.2d[0] OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.2d[0] OP_ir v17.2d, v5.2d, v12.d[0]
OP_rr v18.2d, v6.2d, v12.2d[0] OP_rr v18.2d, v6.2d, v12.d[0]
OP_ii v18.2d, v7.2d, v13.2d[0] OP_ii v18.2d, v7.2d, v13.d[0]
OP_ri v19.2d, v6.2d, v13.2d[0] OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.2d[0] OP_ir v19.2d, v7.2d, v12.d[0]
// v20-v23: lane 1 of v12/v13
OP_rr v20.2d, v4.2d, v12.2d[1] OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.2d[1] OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.2d[1] OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.2d[1] OP_ir v21.2d, v5.2d, v12.d[1]
OP_rr v22.2d, v6.2d, v12.2d[1] OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.2d[1] OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.2d[1] OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.2d[1] OP_ir v23.2d, v7.2d, v12.d[1]
// v24-v27: lane 0 of v14/v15
OP_rr v24.2d, v4.2d, v14.2d[0] OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.2d[0] OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.2d[0] OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.2d[0] OP_ir v25.2d, v5.2d, v14.d[0]
OP_rr v26.2d, v6.2d, v14.2d[0] OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.2d[0] OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.2d[0] OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.2d[0] OP_ir v27.2d, v7.2d, v14.d[0]
// v28-v31: lane 1 of v14/v15
OP_rr v28.2d, v4.2d, v14.2d[1] OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.2d[1] OP_ii v28.2d, v5.2d, v15.d[1]
OP_ri v29.2d, v4.2d, v15.2d[1] OP_ri v29.2d, v4.2d, v15.d[1]
OP_ir v29.2d, v5.2d, v14.2d[1] OP_ir v29.2d, v5.2d, v14.d[1]
OP_rr v30.2d, v6.2d, v14.2d[1] OP_rr v30.2d, v6.2d, v14.d[1]
OP_ii v30.2d, v7.2d, v15.2d[1] OP_ii v30.2d, v7.2d, v15.d[1]
OP_ri v31.2d, v6.2d, v15.2d[1] OP_ri v31.2d, v6.2d, v15.d[1]
OP_ir v31.2d, v7.2d, v14.2d[1] OP_ir v31.2d, v7.2d, v14.d[1]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
OP_rr v26.2d, v2.2d, v10.2d[0] OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.2d[0] OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
OP_rr v30.2d, v2.2d, v10.2d[1] OP_rr v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
OP_ri v31.2d, v2.2d, v11.2d[1] OP_ri v31.2d, v2.2d, v11.d[1]
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA] ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA] ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr d16, d0, v8.2d[0] OP_rr d16, d0, v8.d[0]
OP_ii d16, d1, v9.2d[0] OP_ii d16, d1, v9.d[0]
OP_ri d17, d0, v9.2d[0] OP_ri d17, d0, v9.d[0]
OP_ir d17, d1, v8.2d[0] OP_ir d17, d1, v8.d[0]
OP_rr d20, d0, v8.2d[1] OP_rr d20, d0, v8.d[1]
OP_ii d20, d1, v9.2d[1] OP_ii d20, d1, v9.d[1]
OP_ri d21, d0, v9.2d[1] OP_ri d21, d0, v9.d[1]
OP_ir d21, d1, v8.2d[1] OP_ir d21, d1, v8.d[1]
OP_rr d24, d0, v10.2d[0] OP_rr d24, d0, v10.d[0]
OP_ii d24, d1, v11.2d[0] OP_ii d24, d1, v11.d[0]
OP_ri d25, d0, v11.2d[0] OP_ri d25, d0, v11.d[0]
OP_ir d25, d1, v10.2d[0] OP_ir d25, d1, v10.d[0]
OP_rr d28, d0, v10.2d[1] OP_rr d28, d0, v10.d[1]
OP_ii d28, d1, v11.2d[1] OP_ii d28, d1, v11.d[1]
OP_ri d29, d0, v11.2d[1] OP_ri d29, d0, v11.d[1]
OP_ir d29, d1, v10.2d[1] OP_ir d29, d1, v10.d[1]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA] ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA] ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr d16, d0, v8.2d[0] OP_rr d16, d0, v8.d[0]
OP_ii d16, d1, v9.2d[0] OP_ii d16, d1, v9.d[0]
OP_ri d17, d0, v9.2d[0] OP_ri d17, d0, v9.d[0]
OP_ir d17, d1, v8.2d[0] OP_ir d17, d1, v8.d[0]
OP_rr d20, d0, v8.2d[1] OP_rr d20, d0, v8.d[1]
OP_ii d20, d1, v9.2d[1] OP_ii d20, d1, v9.d[1]
OP_ri d21, d0, v9.2d[1] OP_ri d21, d0, v9.d[1]
OP_ir d21, d1, v8.2d[1] OP_ir d21, d1, v8.d[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2

View File

@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.2d, v0.2d, v9.2d[0] fmls v17.2d, v0.2d, v9.d[0]
#else #else
fmul v17.2d, v0.2d, v9.2d[0] fmul v17.2d, v0.2d, v9.d[0]
#endif #endif
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.2d[0] fmls v19.2d, v2.2d, v9.d[0]
#else #else
fmul v19.2d, v2.2d, v9.2d[0] fmul v19.2d, v2.2d, v9.d[0]
#endif #endif
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.2d, v0.2d, v9.2d[1] fmls v21.2d, v0.2d, v9.d[1]
#else #else
fmul v21.2d, v0.2d, v9.2d[1] fmul v21.2d, v0.2d, v9.d[1]
#endif #endif
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.2d[1] fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.2d, v2.2d, v9.2d[1] fmls v23.2d, v2.2d, v9.d[1]
#else #else
fmul v23.2d, v2.2d, v9.2d[1] fmul v23.2d, v2.2d, v9.d[1]
#endif #endif
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.2d, v0.2d, v11.2d[0] fmls v25.2d, v0.2d, v11.d[0]
#else #else
fmul v25.2d, v0.2d, v11.2d[0] fmul v25.2d, v0.2d, v11.d[0]
#endif #endif
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
fmul v26.2d, v2.2d, v10.2d[0] fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.2d, v2.2d, v11.2d[0] fmls v27.2d, v2.2d, v11.d[0]
#else #else
fmul v27.2d, v2.2d, v11.2d[0] fmul v27.2d, v2.2d, v11.d[0]
#endif #endif
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
fmul v28.2d, v0.2d, v10.2d[1] fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.2d, v0.2d, v11.2d[1] fmls v29.2d, v0.2d, v11.d[1]
#else #else
fmul v29.2d, v0.2d, v11.2d[1] fmul v29.2d, v0.2d, v11.d[1]
#endif #endif
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
fmul v30.2d, v2.2d, v10.2d[1] fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.2d, v2.2d, v11.2d[1] fmls v31.2d, v2.2d, v11.d[1]
#else #else
fmul v31.2d, v2.2d, v11.2d[1] fmul v31.2d, v2.2d, v11.d[1]
#endif #endif
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB] ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
// KERNEL4x4_M1: one accumulation step of the 4x4 kernel. Multiplies the
// operand set v0-v3 (from pA) by lanes of v8-v11 (from pB) into the
// accumulators v16-v31, while loading the alternate operand set
// v4-v7 / v12-v15 that the following step reads.
//
// Register use visible here:
//   read    : v0-v3, v8-v11
//   updated : v16-v31 (even register: OP_rr + OP_ii; odd: OP_ri + OP_ir)
//   loaded  : v12-v15 from pB, v4-v7 from pA
//   pointers: pA += 64, pB += 64 (two "add #32" each)
//
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this view; presumably they
// expand to multiply-accumulate instructions selected by the conjugation
// build flags - confirm against their definitions.
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
// Lane 0 of v8/v9 -> v16-v19, with the pB loads for the next step interleaved
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round ld2 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// Lane 1 of v8/v9 -> v20-v23, with the pA loads for the next step interleaved
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round ld2 {v4.2d, v5.2d} , [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// Lane 0 of v10/v11 -> v24-v27, with prefetches 512 bytes ahead of pA and pB
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v26.2d, v2.2d, v10.2d[0] OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.2d[0] OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// Lane 1 of v10/v11 -> v28-v31
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
OP_rr v30.2d, v2.2d, v10.2d[1] OP_rr v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
OP_ri v31.2d, v2.2d, v11.2d[1] OP_ri v31.2d, v2.2d, v11.d[1]
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
.endm .endm
// KERNEL4x4_M2: one accumulation step of the 4x4 kernel. Multiplies the
// operand set v4-v7 (from pA) by lanes of v12-v15 (from pB) into the
// accumulators v16-v31, while reloading the other operand set
// v0-v3 / v8-v11 that the following step reads.
//
// Register use visible here:
//   read    : v4-v7, v12-v15
//   updated : v16-v31 (even register: OP_rr + OP_ii; odd: OP_ri + OP_ir)
//   loaded  : v8-v11 from pB, v0-v3 from pA
//   pointers: pA += 64, pB += 64 (two "add #32" each)
//
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this view; presumably they
// expand to multiply-accumulate instructions selected by the conjugation
// build flags - confirm against their definitions.
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
// Lane 0 of v12/v13 -> v16-v19, with the pB loads for the next step interleaved
OP_rr v16.2d, v4.2d, v12.2d[0] OP_rr v16.2d, v4.2d, v12.d[0]
OP_ii v16.2d, v5.2d, v13.2d[0] OP_ii v16.2d, v5.2d, v13.d[0]
OP_ri v17.2d, v4.2d, v13.2d[0] OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.2d[0] OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round ld2 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.2d[0] OP_rr v18.2d, v6.2d, v12.d[0]
OP_ii v18.2d, v7.2d, v13.2d[0] OP_ii v18.2d, v7.2d, v13.d[0]
OP_ri v19.2d, v6.2d, v13.2d[0] OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.2d[0] OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
// Lane 1 of v12/v13 -> v20-v23, with the pA loads for the next step interleaved
OP_rr v20.2d, v4.2d, v12.2d[1] OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.2d[1] OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.2d[1] OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.2d[1] OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round ld2 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.2d[1] OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.2d[1] OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.2d[1] OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.2d[1] OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
// Lane 0 of v14/v15 -> v24-v27, with prefetches 512 bytes ahead of pA and pB
OP_rr v24.2d, v4.2d, v14.2d[0] OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.2d[0] OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.2d[0] OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.2d[0] OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v26.2d, v6.2d, v14.2d[0] OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.2d[0] OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.2d[0] OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.2d[0] OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
// Lane 1 of v14/v15 -> v28-v31
OP_rr v28.2d, v4.2d, v14.2d[1] OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.2d[1] OP_ii v28.2d, v5.2d, v15.d[1]
OP_ri v29.2d, v4.2d, v15.2d[1] OP_ri v29.2d, v4.2d, v15.d[1]
OP_ir v29.2d, v5.2d, v14.2d[1] OP_ir v29.2d, v5.2d, v14.d[1]
OP_rr v30.2d, v6.2d, v14.2d[1] OP_rr v30.2d, v6.2d, v14.d[1]
OP_ii v30.2d, v7.2d, v15.2d[1] OP_ii v30.2d, v7.2d, v15.d[1]
OP_ri v31.2d, v6.2d, v15.2d[1] OP_ri v31.2d, v6.2d, v15.d[1]
OP_ir v31.2d, v7.2d, v14.2d[1] OP_ir v31.2d, v7.2d, v14.d[1]
.endm .endm
// KERNEL4x4_E: arithmetic-only step of the 4x4 kernel. Applies the 32 OP_*
// updates to v16-v31 using v4-v7 and lanes of v12-v15, and touches neither
// pA nor pB: no loads, no pointer increments, no prefetches.
// Presumably used as the last step of the M1/M2 sequence, when no further
// operands need fetching - confirm against the call sites.
//
// Register use visible here:
//   read    : v4-v7, v12-v15
//   updated : v16-v31 (even register: OP_rr + OP_ii; odd: OP_ri + OP_ir)
//
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this view.
.macro KERNEL4x4_E .macro KERNEL4x4_E
// Lane 0 of v12/v13 -> v16-v19
OP_rr v16.2d, v4.2d, v12.2d[0] OP_rr v16.2d, v4.2d, v12.d[0]
OP_ii v16.2d, v5.2d, v13.2d[0] OP_ii v16.2d, v5.2d, v13.d[0]
OP_ri v17.2d, v4.2d, v13.2d[0] OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.2d[0] OP_ir v17.2d, v5.2d, v12.d[0]
OP_rr v18.2d, v6.2d, v12.2d[0] OP_rr v18.2d, v6.2d, v12.d[0]
OP_ii v18.2d, v7.2d, v13.2d[0] OP_ii v18.2d, v7.2d, v13.d[0]
OP_ri v19.2d, v6.2d, v13.2d[0] OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.2d[0] OP_ir v19.2d, v7.2d, v12.d[0]
// Lane 1 of v12/v13 -> v20-v23
OP_rr v20.2d, v4.2d, v12.2d[1] OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.2d[1] OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.2d[1] OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.2d[1] OP_ir v21.2d, v5.2d, v12.d[1]
OP_rr v22.2d, v6.2d, v12.2d[1] OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.2d[1] OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.2d[1] OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.2d[1] OP_ir v23.2d, v7.2d, v12.d[1]
// Lane 0 of v14/v15 -> v24-v27
OP_rr v24.2d, v4.2d, v14.2d[0] OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.2d[0] OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.2d[0] OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.2d[0] OP_ir v25.2d, v5.2d, v14.d[0]
OP_rr v26.2d, v6.2d, v14.2d[0] OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.2d[0] OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.2d[0] OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.2d[0] OP_ir v27.2d, v7.2d, v14.d[0]
// Lane 1 of v14/v15 -> v28-v31
OP_rr v28.2d, v4.2d, v14.2d[1] OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.2d[1] OP_ii v28.2d, v5.2d, v15.d[1]
OP_ri v29.2d, v4.2d, v15.2d[1] OP_ri v29.2d, v4.2d, v15.d[1]
OP_ir v29.2d, v5.2d, v14.2d[1] OP_ir v29.2d, v5.2d, v14.d[1]
OP_rr v30.2d, v6.2d, v14.2d[1] OP_rr v30.2d, v6.2d, v14.d[1]
OP_ii v30.2d, v7.2d, v15.2d[1] OP_ii v30.2d, v7.2d, v15.d[1]
OP_ri v31.2d, v6.2d, v15.2d[1] OP_ri v31.2d, v6.2d, v15.d[1]
OP_ir v31.2d, v7.2d, v14.2d[1] OP_ir v31.2d, v7.2d, v14.d[1]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
OP_rr v26.2d, v2.2d, v10.2d[0] OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.2d[0] OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
OP_rr v30.2d, v2.2d, v10.2d[1] OP_rr v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
OP_ri v31.2d, v2.2d, v11.2d[1] OP_ri v31.2d, v2.2d, v11.d[1]
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA] ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA] ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr d16, d0, v8.2d[0] OP_rr d16, d0, v8.d[0]
OP_ii d16, d1, v9.2d[0] OP_ii d16, d1, v9.d[0]
OP_ri d17, d0, v9.2d[0] OP_ri d17, d0, v9.d[0]
OP_ir d17, d1, v8.2d[0] OP_ir d17, d1, v8.d[0]
OP_rr d20, d0, v8.2d[1] OP_rr d20, d0, v8.d[1]
OP_ii d20, d1, v9.2d[1] OP_ii d20, d1, v9.d[1]
OP_ri d21, d0, v9.2d[1] OP_ri d21, d0, v9.d[1]
OP_ir d21, d1, v8.2d[1] OP_ir d21, d1, v8.d[1]
OP_rr d24, d0, v10.2d[0] OP_rr d24, d0, v10.d[0]
OP_ii d24, d1, v11.2d[0] OP_ii d24, d1, v11.d[0]
OP_ri d25, d0, v11.2d[0] OP_ri d25, d0, v11.d[0]
OP_ir d25, d1, v10.2d[0] OP_ir d25, d1, v10.d[0]
OP_rr d28, d0, v10.2d[1] OP_rr d28, d0, v10.d[1]
OP_ii d28, d1, v11.2d[1] OP_ii d28, d1, v11.d[1]
OP_ri d29, d0, v11.2d[1] OP_ri d29, d0, v11.d[1]
OP_ir d29, d1, v10.2d[1] OP_ir d29, d1, v10.d[1]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA] ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA] ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr d16, d0, v8.2d[0] OP_rr d16, d0, v8.d[0]
OP_ii d16, d1, v9.2d[0] OP_ii d16, d1, v9.d[0]
OP_ri d17, d0, v9.2d[0] OP_ri d17, d0, v9.d[0]
OP_ir d17, d1, v8.2d[0] OP_ir d17, d1, v8.d[0]
OP_rr d20, d0, v8.2d[1] OP_rr d20, d0, v8.d[1]
OP_ii d20, d1, v9.2d[1] OP_ii d20, d1, v9.d[1]
OP_ri d21, d0, v9.2d[1] OP_ri d21, d0, v9.d[1]
OP_ir d21, d1, v8.2d[1] OP_ir d21, d1, v8.d[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2