THUNDERX2T99: Improve DGEMM
This commit is contained in:
parent
981064acc6
commit
0f1d6e8b39
|
@ -151,187 +151,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNEL8x4_I
|
.macro KERNEL8x4_I
|
||||||
ldur q0, [pA]
|
ldp q0, q1, [pA]
|
||||||
ldur q1, [pA, #16]
|
ldp q8, q9, [pB]
|
||||||
|
ldp q2, q3, [pA, #32]
|
||||||
ldur q8, [pB]
|
ldp q4, q5, [pA, #64]
|
||||||
ldur q9, [pB, #16]
|
ldp q12, q13, [pB, #32]
|
||||||
|
ldp q6, q7, [pA, #96]
|
||||||
|
|
||||||
fmul v16.2d, v0.2d, v8.d[0]
|
fmul v16.2d, v0.2d, v8.d[0]
|
||||||
fmul v20.2d, v0.2d, v8.d[1]
|
fmul v20.2d, v0.2d, v8.d[1]
|
||||||
|
|
||||||
fmul v17.2d, v1.2d, v8.d[0]
|
fmul v17.2d, v1.2d, v8.d[0]
|
||||||
fmul v21.2d, v1.2d, v8.d[1]
|
fmul v21.2d, v1.2d, v8.d[1]
|
||||||
|
|
||||||
ldp q2, q3, [pA, #32]
|
add pA, pA, #128
|
||||||
fmul v24.2d, v0.2d, v9.d[0]
|
add pB, pB, #64
|
||||||
ldp q4, q5, [pA, #64]
|
|
||||||
fmul v28.2d, v0.2d, v9.d[1]
|
|
||||||
|
|
||||||
|
fmul v24.2d, v0.2d, v9.d[0]
|
||||||
|
fmul v28.2d, v0.2d, v9.d[1]
|
||||||
fmul v25.2d, v1.2d, v9.d[0]
|
fmul v25.2d, v1.2d, v9.d[0]
|
||||||
fmul v29.2d, v1.2d, v9.d[1]
|
fmul v29.2d, v1.2d, v9.d[1]
|
||||||
|
|
||||||
ldur q12, [pB, #32]
|
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
|
||||||
ldur q13, [pB, #48]
|
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
|
||||||
|
|
||||||
fmul v18.2d, v2.2d, v8.d[0]
|
fmul v18.2d, v2.2d, v8.d[0]
|
||||||
fmul v22.2d, v2.2d, v8.d[1]
|
fmul v22.2d, v2.2d, v8.d[1]
|
||||||
|
|
||||||
ldur q6, [pA, #96]
|
|
||||||
ldur q7, [pA, #112]
|
|
||||||
|
|
||||||
add pB, pB, #64
|
|
||||||
add pA, pA, #128
|
|
||||||
|
|
||||||
fmul v26.2d, v2.2d, v9.d[0]
|
fmul v26.2d, v2.2d, v9.d[0]
|
||||||
fmul v30.2d, v2.2d, v9.d[1]
|
fmul v30.2d, v2.2d, v9.d[1]
|
||||||
|
|
||||||
fmul v19.2d, v3.2d, v8.d[0]
|
fmul v19.2d, v3.2d, v8.d[0]
|
||||||
fmul v27.2d, v3.2d, v9.d[0]
|
fmul v27.2d, v3.2d, v9.d[0]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
|
|
||||||
|
|
||||||
fmul v31.2d, v3.2d, v9.d[1]
|
fmul v31.2d, v3.2d, v9.d[1]
|
||||||
fmul v23.2d, v3.2d, v8.d[1]
|
fmul v23.2d, v3.2d, v8.d[1]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNEL8x4_M1_M2
|
.macro KERNEL8x4_M1_M2
|
||||||
|
|
||||||
|
ldp q12, q13, [pB]
|
||||||
|
ldp q4, q5, [pA]
|
||||||
|
ldp q6, q7, [pA, #32]
|
||||||
|
|
||||||
fmla v16.2d, v0.2d, v8.d[0]
|
fmla v16.2d, v0.2d, v8.d[0]
|
||||||
fmla v20.2d, v0.2d, v8.d[1]
|
fmla v20.2d, v0.2d, v8.d[1]
|
||||||
|
|
||||||
ldp q4, q5, [pA]
|
|
||||||
|
|
||||||
fmla v24.2d, v0.2d, v9.d[0]
|
fmla v24.2d, v0.2d, v9.d[0]
|
||||||
fmla v28.2d, v0.2d, v9.d[1]
|
fmla v28.2d, v0.2d, v9.d[1]
|
||||||
|
|
||||||
ldp q12, q13, [pB]
|
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
|
||||||
|
|
||||||
fmla v17.2d, v1.2d, v8.d[0]
|
fmla v17.2d, v1.2d, v8.d[0]
|
||||||
fmla v25.2d, v1.2d, v9.d[0]
|
fmla v25.2d, v1.2d, v9.d[0]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
|
|
||||||
|
|
||||||
fmla v21.2d, v1.2d, v8.d[1]
|
fmla v21.2d, v1.2d, v8.d[1]
|
||||||
fmla v29.2d, v1.2d, v9.d[1]
|
fmla v29.2d, v1.2d, v9.d[1]
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
|
||||||
|
|
||||||
fmla v18.2d, v2.2d, v8.d[0]
|
fmla v18.2d, v2.2d, v8.d[0]
|
||||||
fmla v22.2d, v2.2d, v8.d[1]
|
fmla v22.2d, v2.2d, v8.d[1]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
|
|
||||||
|
|
||||||
fmla v26.2d, v2.2d, v9.d[0]
|
fmla v26.2d, v2.2d, v9.d[0]
|
||||||
fmla v30.2d, v2.2d, v9.d[1]
|
fmla v30.2d, v2.2d, v9.d[1]
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #3840]
|
||||||
|
|
||||||
fmla v19.2d, v3.2d, v8.d[0]
|
fmla v19.2d, v3.2d, v8.d[0]
|
||||||
fmla v23.2d, v3.2d, v8.d[1]
|
fmla v23.2d, v3.2d, v8.d[1]
|
||||||
|
|
||||||
ldp q6, q7, [pA, #32]
|
|
||||||
|
|
||||||
fmla v27.2d, v3.2d, v9.d[0]
|
fmla v27.2d, v3.2d, v9.d[0]
|
||||||
fmla v31.2d, v3.2d, v9.d[1]
|
fmla v31.2d, v3.2d, v9.d[1]
|
||||||
|
|
||||||
|
|
||||||
|
ldp q8, q9, [pB, #32]
|
||||||
|
ldp q0, q1, [pA, #64]
|
||||||
|
ldp q2, q3, [pA, #96]
|
||||||
|
|
||||||
fmla v16.2d, v4.2d, v12.d[0]
|
fmla v16.2d, v4.2d, v12.d[0]
|
||||||
fmla v20.2d, v4.2d, v12.d[1]
|
fmla v20.2d, v4.2d, v12.d[1]
|
||||||
fmla v24.2d, v4.2d, v13.d[0]
|
fmla v24.2d, v4.2d, v13.d[0]
|
||||||
fmla v28.2d, v4.2d, v13.d[1]
|
fmla v28.2d, v4.2d, v13.d[1]
|
||||||
|
|
||||||
ldp q0, q1, [pA, #64]
|
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
|
||||||
|
|
||||||
fmla v17.2d, v5.2d, v12.d[0]
|
fmla v17.2d, v5.2d, v12.d[0]
|
||||||
fmla v25.2d, v5.2d, v13.d[0]
|
fmla v25.2d, v5.2d, v13.d[0]
|
||||||
|
|
||||||
ldp q8, q9, [pB, #32]
|
|
||||||
ldp q2, q3, [pA, #96]
|
|
||||||
|
|
||||||
fmla v21.2d, v5.2d, v12.d[1]
|
fmla v21.2d, v5.2d, v12.d[1]
|
||||||
fmla v29.2d, v5.2d, v13.d[1]
|
fmla v29.2d, v5.2d, v13.d[1]
|
||||||
|
|
||||||
fmla v18.2d, v6.2d, v12.d[0]
|
fmla v18.2d, v6.2d, v12.d[0]
|
||||||
fmla v22.2d, v6.2d, v12.d[1]
|
fmla v22.2d, v6.2d, v12.d[1]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
|
|
||||||
|
|
||||||
fmla v26.2d, v6.2d, v13.d[0]
|
fmla v26.2d, v6.2d, v13.d[0]
|
||||||
fmla v30.2d, v6.2d, v13.d[1]
|
fmla v30.2d, v6.2d, v13.d[1]
|
||||||
|
|
||||||
fmla v19.2d, v7.2d, v12.d[0]
|
|
||||||
fmla v23.2d, v7.2d, v12.d[1]
|
|
||||||
|
|
||||||
add pB, pB, #64
|
add pB, pB, #64
|
||||||
add pA, pA, #128
|
add pA, pA, #128
|
||||||
|
|
||||||
|
fmla v19.2d, v7.2d, v12.d[0]
|
||||||
|
fmla v23.2d, v7.2d, v12.d[1]
|
||||||
fmla v27.2d, v7.2d, v13.d[0]
|
fmla v27.2d, v7.2d, v13.d[0]
|
||||||
fmla v31.2d, v7.2d, v13.d[1]
|
fmla v31.2d, v7.2d, v13.d[1]
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
.macro KERNEL8x4_M1
|
.macro KERNEL8x4_M1
|
||||||
|
ldp q12, q13, [pB]
|
||||||
|
ldp q4, q5, [pA]
|
||||||
|
ldp q6, q7, [pA, #32]
|
||||||
|
|
||||||
fmla v16.2d, v0.2d, v8.d[0]
|
fmla v16.2d, v0.2d, v8.d[0]
|
||||||
fmla v20.2d, v0.2d, v8.d[1]
|
fmla v20.2d, v0.2d, v8.d[1]
|
||||||
|
|
||||||
ldp q4, q5, [pA], #32
|
|
||||||
|
|
||||||
fmla v24.2d, v0.2d, v9.d[0]
|
fmla v24.2d, v0.2d, v9.d[0]
|
||||||
fmla v28.2d, v0.2d, v9.d[1]
|
fmla v28.2d, v0.2d, v9.d[1]
|
||||||
|
|
||||||
ldp q12, q13, [pB]
|
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
|
||||||
add pB, pB, #32
|
|
||||||
|
|
||||||
fmla v17.2d, v1.2d, v8.d[0]
|
fmla v17.2d, v1.2d, v8.d[0]
|
||||||
fmla v25.2d, v1.2d, v9.d[0]
|
fmla v25.2d, v1.2d, v9.d[0]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
|
|
||||||
|
|
||||||
fmla v21.2d, v1.2d, v8.d[1]
|
fmla v21.2d, v1.2d, v8.d[1]
|
||||||
fmla v29.2d, v1.2d, v9.d[1]
|
fmla v29.2d, v1.2d, v9.d[1]
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
|
||||||
|
|
||||||
fmla v18.2d, v2.2d, v8.d[0]
|
fmla v18.2d, v2.2d, v8.d[0]
|
||||||
fmla v22.2d, v2.2d, v8.d[1]
|
fmla v22.2d, v2.2d, v8.d[1]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
|
|
||||||
|
|
||||||
fmla v26.2d, v2.2d, v9.d[0]
|
fmla v26.2d, v2.2d, v9.d[0]
|
||||||
fmla v30.2d, v2.2d, v9.d[1]
|
fmla v30.2d, v2.2d, v9.d[1]
|
||||||
|
|
||||||
|
add pB, pB, #32
|
||||||
|
add pA, pA, #64
|
||||||
|
|
||||||
fmla v19.2d, v3.2d, v8.d[0]
|
fmla v19.2d, v3.2d, v8.d[0]
|
||||||
fmla v23.2d, v3.2d, v8.d[1]
|
fmla v23.2d, v3.2d, v8.d[1]
|
||||||
|
|
||||||
ldp q6, q7, [pA], #32
|
|
||||||
|
|
||||||
fmla v27.2d, v3.2d, v9.d[0]
|
fmla v27.2d, v3.2d, v9.d[0]
|
||||||
fmla v31.2d, v3.2d, v9.d[1]
|
fmla v31.2d, v3.2d, v9.d[1]
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNEL8x4_M2
|
.macro KERNEL8x4_M2
|
||||||
|
ldp q8, q9, [pB]
|
||||||
|
ldp q0, q1, [pA]
|
||||||
|
ldp q2, q3, [pA, #32]
|
||||||
|
|
||||||
fmla v16.2d, v4.2d, v12.d[0]
|
fmla v16.2d, v4.2d, v12.d[0]
|
||||||
fmla v20.2d, v4.2d, v12.d[1]
|
fmla v20.2d, v4.2d, v12.d[1]
|
||||||
fmla v24.2d, v4.2d, v13.d[0]
|
fmla v24.2d, v4.2d, v13.d[0]
|
||||||
fmla v28.2d, v4.2d, v13.d[1]
|
fmla v28.2d, v4.2d, v13.d[1]
|
||||||
|
|
||||||
ldp q0, q1, [pA], #32
|
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
|
||||||
|
|
||||||
fmla v17.2d, v5.2d, v12.d[0]
|
fmla v17.2d, v5.2d, v12.d[0]
|
||||||
fmla v25.2d, v5.2d, v13.d[0]
|
fmla v25.2d, v5.2d, v13.d[0]
|
||||||
|
|
||||||
ldp q8, q9, [pB]
|
|
||||||
add pB, pB, #32
|
|
||||||
|
|
||||||
fmla v21.2d, v5.2d, v12.d[1]
|
fmla v21.2d, v5.2d, v12.d[1]
|
||||||
fmla v29.2d, v5.2d, v13.d[1]
|
fmla v29.2d, v5.2d, v13.d[1]
|
||||||
|
|
||||||
fmla v18.2d, v6.2d, v12.d[0]
|
fmla v18.2d, v6.2d, v12.d[0]
|
||||||
fmla v22.2d, v6.2d, v12.d[1]
|
fmla v22.2d, v6.2d, v12.d[1]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
|
|
||||||
|
|
||||||
fmla v26.2d, v6.2d, v13.d[0]
|
fmla v26.2d, v6.2d, v13.d[0]
|
||||||
fmla v30.2d, v6.2d, v13.d[1]
|
fmla v30.2d, v6.2d, v13.d[1]
|
||||||
|
|
||||||
|
add pB, pB, #32
|
||||||
|
add pA, pA, #64
|
||||||
|
|
||||||
fmla v19.2d, v7.2d, v12.d[0]
|
fmla v19.2d, v7.2d, v12.d[0]
|
||||||
fmla v23.2d, v7.2d, v12.d[1]
|
fmla v23.2d, v7.2d, v12.d[1]
|
||||||
|
|
||||||
ldp q2, q3, [pA], #32
|
|
||||||
|
|
||||||
fmla v27.2d, v7.2d, v13.d[0]
|
fmla v27.2d, v7.2d, v13.d[0]
|
||||||
fmla v31.2d, v7.2d, v13.d[1]
|
fmla v31.2d, v7.2d, v13.d[1]
|
||||||
.endm
|
.endm
|
||||||
|
@ -342,13 +319,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmla v24.2d, v4.2d, v13.d[0]
|
fmla v24.2d, v4.2d, v13.d[0]
|
||||||
fmla v28.2d, v4.2d, v13.d[1]
|
fmla v28.2d, v4.2d, v13.d[1]
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
|
||||||
|
|
||||||
fmla v17.2d, v5.2d, v12.d[0]
|
fmla v17.2d, v5.2d, v12.d[0]
|
||||||
fmla v25.2d, v5.2d, v13.d[0]
|
fmla v25.2d, v5.2d, v13.d[0]
|
||||||
fmla v21.2d, v5.2d, v12.d[1]
|
fmla v21.2d, v5.2d, v12.d[1]
|
||||||
fmla v29.2d, v5.2d, v13.d[1]
|
fmla v29.2d, v5.2d, v13.d[1]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
|
|
||||||
|
|
||||||
fmla v18.2d, v6.2d, v12.d[0]
|
fmla v18.2d, v6.2d, v12.d[0]
|
||||||
fmla v22.2d, v6.2d, v12.d[1]
|
fmla v22.2d, v6.2d, v12.d[1]
|
||||||
fmla v26.2d, v6.2d, v13.d[0]
|
fmla v26.2d, v6.2d, v13.d[0]
|
||||||
|
@ -361,42 +338,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNEL8x4_SUB
|
.macro KERNEL8x4_SUB
|
||||||
ldp q0, q1, [pA], #32
|
ldp q0, q1, [pA]
|
||||||
|
ldp q8, q9, [pB]
|
||||||
ldur q8, [pB]
|
ldp q2, q3, [pA, #32]
|
||||||
|
|
||||||
fmla v16.2d, v0.2d, v8.d[0]
|
|
||||||
fmla v20.2d, v0.2d, v8.d[1]
|
|
||||||
|
|
||||||
ldur q9, [pB, #16]
|
|
||||||
add pB, pB, #32
|
|
||||||
|
|
||||||
fmla v17.2d, v1.2d, v8.d[0]
|
|
||||||
fmla v21.2d, v1.2d, v8.d[1]
|
|
||||||
|
|
||||||
ldp q2, q3, [pA], #32
|
|
||||||
|
|
||||||
fmla v24.2d, v0.2d, v9.d[0]
|
|
||||||
fmla v28.2d, v0.2d, v9.d[1]
|
|
||||||
|
|
||||||
fmla v25.2d, v1.2d, v9.d[0]
|
|
||||||
fmla v29.2d, v1.2d, v9.d[1]
|
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
|
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
|
||||||
|
|
||||||
fmla v18.2d, v2.2d, v8.d[0]
|
fmla v16.2d, v0.2d, v8.d[0]
|
||||||
fmla v22.2d, v2.2d, v8.d[1]
|
fmla v20.2d, v0.2d, v8.d[1]
|
||||||
|
fmla v17.2d, v1.2d, v8.d[0]
|
||||||
|
fmla v21.2d, v1.2d, v8.d[1]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
|
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
|
||||||
|
|
||||||
fmla v26.2d, v2.2d, v9.d[0]
|
fmla v24.2d, v0.2d, v9.d[0]
|
||||||
fmla v30.2d, v2.2d, v9.d[1]
|
fmla v28.2d, v0.2d, v9.d[1]
|
||||||
|
fmla v25.2d, v1.2d, v9.d[0]
|
||||||
|
fmla v29.2d, v1.2d, v9.d[1]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
|
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
|
||||||
|
|
||||||
|
fmla v18.2d, v2.2d, v8.d[0]
|
||||||
|
fmla v22.2d, v2.2d, v8.d[1]
|
||||||
|
fmla v26.2d, v2.2d, v9.d[0]
|
||||||
|
fmla v30.2d, v2.2d, v9.d[1]
|
||||||
|
|
||||||
|
add pB, pB, #32
|
||||||
|
add pA, pA, #64
|
||||||
|
|
||||||
fmla v19.2d, v3.2d, v8.d[0]
|
fmla v19.2d, v3.2d, v8.d[0]
|
||||||
fmla v27.2d, v3.2d, v9.d[0]
|
fmla v27.2d, v3.2d, v9.d[0]
|
||||||
|
|
||||||
fmla v31.2d, v3.2d, v9.d[1]
|
fmla v31.2d, v3.2d, v9.d[1]
|
||||||
fmla v23.2d, v3.2d, v8.d[1]
|
fmla v23.2d, v3.2d, v8.d[1]
|
||||||
.endm
|
.endm
|
||||||
|
|
Loading…
Reference in New Issue