THUNDERX2T99: Improve DGEMM

This commit is contained in:
Ashwin Sekhar T K 2017-01-16 23:16:23 -08:00
parent 981064acc6
commit 0f1d6e8b39
1 changed file with 71 additions and 100 deletions

View File

@ -151,187 +151,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
// KERNEL8x4_I: opening step of the 8x4 tile computation.
// Uses fmul rather than fmla, so v16-v31 are overwritten with products instead
// of being accumulated into. Operands: v0-v3 (loaded from pA) times the four
// lanes v8.d[0], v8.d[1], v9.d[0], v9.d[1] (loaded from pB). q4-q7 and q12/q13
// are loaded here but not consumed by any instruction in this macro.
// NOTE(review): this text looks like a diff with its +/- markers stripped, so
// pre- and post-change lines appear interleaved (ldur and ldp forms of the same
// loads, repeated fmul of v24/v28, pA/pB advanced twice). Confirm against the
// real file before assembling or editing.
.macro KERNEL8x4_I
ldur q0, [pA]
ldur q1, [pA, #16]
ldur q8, [pB]
ldur q9, [pB, #16]
// Same registers loaded again with paired loads (see NOTE above).
ldp q0, q1, [pA]
ldp q8, q9, [pB]
ldp q2, q3, [pA, #32]
ldp q4, q5, [pA, #64]
ldp q12, q13, [pB, #32]
ldp q6, q7, [pA, #96]
fmul v16.2d, v0.2d, v8.d[0]
fmul v20.2d, v0.2d, v8.d[1]
fmul v17.2d, v1.2d, v8.d[0]
fmul v21.2d, v1.2d, v8.d[1]
ldp q2, q3, [pA, #32]
fmul v24.2d, v0.2d, v9.d[0]
ldp q4, q5, [pA, #64]
fmul v28.2d, v0.2d, v9.d[1]
// First advance of pA/pB; a second identical pair of adds appears further down.
add pA, pA, #128
add pB, pB, #64
fmul v24.2d, v0.2d, v9.d[0]
fmul v28.2d, v0.2d, v9.d[1]
fmul v25.2d, v1.2d, v9.d[0]
fmul v29.2d, v1.2d, v9.d[1]
// As written these loads use pB after it was advanced by 64, so they read a
// different address than the earlier "ldp q12, q13, [pB, #32]".
ldur q12, [pB, #32]
ldur q13, [pB, #48]
// Prefetch for load into L1 at pA plus the two prefetch-distance constants.
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
fmul v18.2d, v2.2d, v8.d[0]
fmul v22.2d, v2.2d, v8.d[1]
// Likewise, pA has already been advanced by 128 when these loads execute.
ldur q6, [pA, #96]
ldur q7, [pA, #112]
// Second advance of pB/pA (total +128 / +256 as written) - TODO confirm which
// pair of adds belongs to the current revision.
add pB, pB, #64
add pA, pA, #128
fmul v26.2d, v2.2d, v9.d[0]
fmul v30.2d, v2.2d, v9.d[1]
fmul v19.2d, v3.2d, v8.d[0]
fmul v27.2d, v3.2d, v9.d[0]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
fmul v31.2d, v3.2d, v9.d[1]
fmul v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
.endm
// KERNEL8x4_M1_M2: two accumulation steps fused into one macro.
// Step 1 accumulates v0-v3 * {v8.d[0], v8.d[1], v9.d[0], v9.d[1]} into v16-v31
// while q4-q7 / q12-q13 are loaded; step 2 accumulates v4-v7 * lanes of v12/v13
// into the same v16-v31 while q0-q3 / q8-q9 are reloaded from pA+64.. / pB+32.
// pB is advanced by 64 and pA by 128 once, near the end.
// NOTE(review): this text looks like a diff with its +/- markers stripped; the
// same loads and prefetches appear in two places, and one pair of fmla
// instructions is repeated (flagged below). Confirm against the real file.
.macro KERNEL8x4_M1_M2
ldp q12, q13, [pB]
ldp q4, q5, [pA]
ldp q6, q7, [pA, #32]
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.d[1]
ldp q4, q5, [pA]
fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
ldp q12, q13, [pB]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
fmla v17.2d, v1.2d, v8.d[0]
fmla v25.2d, v1.2d, v9.d[0]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
fmla v21.2d, v1.2d, v8.d[1]
fmla v29.2d, v1.2d, v9.d[1]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v8.d[1]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
fmla v26.2d, v2.2d, v9.d[0]
fmla v30.2d, v2.2d, v9.d[1]
// Prefetch at a literal byte offset rather than a named constant.
prfm PLDL1KEEP, [pA, #3840]
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v8.d[1]
ldp q6, q7, [pA, #32]
fmla v27.2d, v3.2d, v9.d[0]
fmla v31.2d, v3.2d, v9.d[1]
// Reload of the step-1 operand registers; v0-v3 / v8-v9 are not read again
// inside this macro after this point.
ldp q8, q9, [pB, #32]
ldp q0, q1, [pA, #64]
ldp q2, q3, [pA, #96]
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
ldp q0, q1, [pA, #64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v13.d[0]
ldp q8, q9, [pB, #32]
ldp q2, q3, [pA, #96]
fmla v21.2d, v5.2d, v12.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v12.d[1]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
fmla v26.2d, v6.2d, v13.d[0]
fmla v30.2d, v6.2d, v13.d[1]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v12.d[1]
add pB, pB, #64
add pA, pA, #128
// NOTE(review): the next two fmla repeat the pair just before the adds. If
// assembled as shown, v19 and v23 would receive the v7*v12 product twice.
// Presumably one pair is a removed diff line - TODO confirm.
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v12.d[1]
fmla v27.2d, v7.2d, v13.d[0]
fmla v31.2d, v7.2d, v13.d[1]
.endm
// KERNEL8x4_M1: one accumulation step using v0-v3 and v8/v9.
// Each of v16-v31 receives exactly one fmla of the form
// vN += v{0..3} * {v8.d[0], v8.d[1], v9.d[0], v9.d[1]}, while q4-q7 and
// q12/q13 are loaded from pA / pB (not consumed inside this macro).
// NOTE(review): this text looks like a diff with its +/- markers stripped.
// Offset-addressed loads, post-indexed loads and explicit adds for the same
// data all appear together, so the net pointer movement as written (pA +128,
// pB +64) is probably not what either revision intends - confirm against the
// real file.
.macro KERNEL8x4_M1
ldp q12, q13, [pB]
ldp q4, q5, [pA]
ldp q6, q7, [pA, #32]
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.d[1]
// Post-indexed form: loads from pA, then pA += 32.
ldp q4, q5, [pA], #32
fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
ldp q12, q13, [pB]
add pB, pB, #32
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
fmla v17.2d, v1.2d, v8.d[0]
fmla v25.2d, v1.2d, v9.d[0]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
fmla v21.2d, v1.2d, v8.d[1]
fmla v29.2d, v1.2d, v9.d[1]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v8.d[1]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
fmla v26.2d, v2.2d, v9.d[0]
fmla v30.2d, v2.2d, v9.d[1]
// Second "add pB" in this macro, plus an explicit pA advance that overlaps
// with the two post-indexed pA loads.
add pB, pB, #32
add pA, pA, #64
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v8.d[1]
// As written, pA has already moved by 96 bytes when this load executes.
ldp q6, q7, [pA], #32
fmla v27.2d, v3.2d, v9.d[0]
fmla v31.2d, v3.2d, v9.d[1]
.endm
// KERNEL8x4_M2: one accumulation step using v4-v7 and v12/v13.
// Each of v16-v31 receives exactly one fmla of the form
// vN += v{4..7} * {v12.d[0], v12.d[1], v13.d[0], v13.d[1]}, while q0-q3 and
// q8/q9 are loaded from pA / pB (not consumed inside this macro).
// NOTE(review): this text looks like a diff with its +/- markers stripped.
// Offset-addressed loads, post-indexed loads and explicit adds for the same
// data all appear together, so the net pointer movement as written (pA +128,
// pB +64) is probably not what either revision intends - confirm against the
// real file.
.macro KERNEL8x4_M2
ldp q8, q9, [pB]
ldp q0, q1, [pA]
ldp q2, q3, [pA, #32]
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
// Post-indexed form: loads from pA, then pA += 32.
ldp q0, q1, [pA], #32
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v13.d[0]
ldp q8, q9, [pB]
add pB, pB, #32
fmla v21.2d, v5.2d, v12.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v12.d[1]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
fmla v26.2d, v6.2d, v13.d[0]
fmla v30.2d, v6.2d, v13.d[1]
// Second "add pB" in this macro, plus an explicit pA advance that overlaps
// with the two post-indexed pA loads.
add pB, pB, #32
add pA, pA, #64
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v12.d[1]
// As written, pA has already moved by 96 bytes when this load executes.
ldp q2, q3, [pA], #32
fmla v27.2d, v7.2d, v13.d[0]
fmla v31.2d, v7.2d, v13.d[1]
.endm
@ -342,13 +319,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v24.2d, v4.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.d[1]
fmla v29.2d, v5.2d, v13.d[1]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v12.d[1]
fmla v26.2d, v6.2d, v13.d[0]
@ -361,42 +338,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
// KERNEL8x4_SUB: self-contained single step - loads q0-q3 from pA and q8/q9
// from pB, then accumulates v0-v3 * {v8.d[0], v8.d[1], v9.d[0], v9.d[1]} into
// v16-v31 with fmla, advancing pA and pB past the consumed data.
// NOTE(review): this text looks like a diff with its +/- markers stripped. Two
// complete load/accumulate sequences are interleaved: one using post-indexed
// ldp / ldur with "add pB" early, and one using offset ldp with both adds near
// the end. As written, every accumulator except v19, v23, v27 and v31 is
// updated twice, the second time with operands reloaded from the
// already-advanced pointers. Confirm against the real file before use.
.macro KERNEL8x4_SUB
// Sequence A: post-indexed loads from pA (pA += 32 each), ldur from pB.
ldp q0, q1, [pA], #32
ldur q8, [pB]
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.d[1]
ldur q9, [pB, #16]
add pB, pB, #32
fmla v17.2d, v1.2d, v8.d[0]
fmla v21.2d, v1.2d, v8.d[1]
ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v25.2d, v1.2d, v9.d[0]
fmla v29.2d, v1.2d, v9.d[1]
// Sequence B: offset-addressed paired loads of the same registers.
ldp q0, q1, [pA]
ldp q8, q9, [pB]
ldp q2, q3, [pA, #32]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v8.d[1]
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v17.2d, v1.2d, v8.d[0]
fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
fmla v26.2d, v2.2d, v9.d[0]
fmla v30.2d, v2.2d, v9.d[1]
fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v25.2d, v1.2d, v9.d[0]
fmla v29.2d, v1.2d, v9.d[1]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v8.d[1]
fmla v26.2d, v2.2d, v9.d[0]
fmla v30.2d, v2.2d, v9.d[1]
// Explicit pointer advance belonging to the offset-addressed sequence; added
// on top of the post-indexed / early adds above as the text stands.
add pB, pB, #32
add pA, pA, #64
fmla v19.2d, v3.2d, v8.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v31.2d, v3.2d, v9.d[1]
fmla v23.2d, v3.2d, v8.d[1]
.endm