Improvements to TRMM and GEMM kernels
This commit is contained in:
parent 8a40f1355e
commit 0a5ff9f9f9
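
The diffs below touch the AArch64 DTRMM/DGEMM 8x4 and ZTRMM/ZGEMM 4x4 micro-kernels (alpha register renames, deeper K-unrolling with software pipelining, and explicit prefetching via A_PRE_SIZE/B_PRE_SIZE/C_PRE_SIZE). As a reading aid only, not part of the commit, here is a minimal C sketch of the tile update the 8x4 double-precision kernel performs; the function name and packing layout are illustrative assumptions.

    /* Sketch of the 8x4 micro-kernel: C[8x4] = alpha * A_panel * B_panel, where A is
     * packed 8 doubles per k-step and B is packed 4 doubles per k-step.  The assembly
     * keeps the 8x4 tile in v16-v31 and SAVE8x4 applies alpha at the end (the TRMM
     * variant overwrites C rather than accumulating into it). */
    static void kernel_8x4_sketch(long K, double alpha, const double *A,
                                  const double *B, double *C, long ldc)
    {
        double acc[4][8] = {{0.0}};
        for (long k = 0; k < K; k++)            /* KERNEL8x4_I/_M1/_M2/_E/_SUB */
            for (int j = 0; j < 4; j++)
                for (int i = 0; i < 8; i++)
                    acc[j][i] += A[8 * k + i] * B[4 * k + j];
        for (int j = 0; j < 4; j++)             /* SAVE8x4 */
            for (int i = 0; i < 8; i++)
                C[j * ldc + i] = alpha * acc[j][i];
    }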
@@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define temp x16
#define tempOffset x17
#define tempK x18
#define pCRow3 x15
#define pA x16
#define alpha x17
#define temp x18
#define tempOffset x19
#define tempK x20

#define alpha0 d10
#define alphaV0 v10.d[0]
#define alpha1 d11
#define alphaV1 v11.d[0]
#define alpha2 d14
#define alphaV2 v14.d[0]
#define alpha3 d15
#define alphaV3 v15.d[0]

#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128

// 00 origM
// 01 origN
@@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
//v08 must save pB0_0, pB0_1
//v09 must save pB0_2, pB0_3
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB1_0, pB1_1
//v13 must save pB1_2, pB1_3
//v14 must save ALPHA2
//v15 must save ALPHA3
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0
//v11 must save pB0_3
//v12 must save pB1_0
//v13 must save pB1_1
//v14 must save pB1_2
//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05

@@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
|
||||
|
||||
.macro KERNEL8x4_I
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
fmul v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
fmul v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
ldp q4, q5, [pA], #32
|
||||
|
||||
fmul v25.2d, v1.2d, v10.d[0]
|
||||
fmul v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp d12, d13, [pB], #16
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
fmul v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
ldp d14, d15, [pB], #16
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
fmul v30.2d, v2.2d, v11.d[0]
|
||||
|
||||
ldp q6, q7, [pA], #32
|
||||
|
||||
fmul v19.2d, v3.2d, v8.d[0]
|
||||
fmul v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
fmul v21.2d, v1.2d, v8.d[1]
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
fmul v23.2d, v3.2d, v8.d[1]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.d[0]
|
||||
fmul v25.2d, v1.2d, v9.d[0]
|
||||
fmul v26.2d, v2.2d, v9.d[0]
|
||||
fmul v27.2d, v3.2d, v9.d[0]
|
||||
fmul v31.2d, v3.2d, v11.d[0]
|
||||
fmul v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v9.d[1]
|
||||
fmul v29.2d, v1.2d, v9.d[1]
|
||||
fmul v30.2d, v2.2d, v9.d[1]
|
||||
fmul v31.2d, v3.2d, v9.d[1]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v6.2d, v7.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp q4, q5, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
ldp d12, d13, [pB], #16
|
||||
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp d14, d15, [pB], #16
|
||||
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
ldp q6, q7, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
fmla v26.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
fmla v30.2d, v2.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v9.d[1]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v6.2d, v7.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
fmla v22.2d, v6.2d, v12.d[1]
|
||||
fmla v23.2d, v7.2d, v12.d[1]
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
fmla v30.2d, v6.2d, v13.d[1]
|
||||
fmla v31.2d, v7.2d, v13.d[1]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
fmla v22.2d, v6.2d, v12.d[1]
|
||||
fmla v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
fmla v30.2d, v6.2d, v13.d[1]
|
||||
fmla v31.2d, v7.2d, v13.d[1]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
fmla v26.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
fmla v30.2d, v2.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
add pCRow1, pCRow0, LDC
|
||||
fmov alpha0, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
stp q0, q1, [pCRow0]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
stp q2, q3, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0
|
||||
fmul v5.2d, v21.2d, alphaV1
|
||||
fmul v6.2d, v22.2d, alphaV2
|
||||
fmul v7.2d, v23.2d, alphaV3
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
fmul v5.2d, v21.2d, alphaV0
|
||||
stp q4, q5, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
fmul v6.2d, v22.2d, alphaV0
|
||||
fmul v7.2d, v23.2d, alphaV0
|
||||
stp q6, q7, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
fmul v0.2d, v24.2d, alphaV0
|
||||
fmul v1.2d, v25.2d, alphaV1
|
||||
fmul v2.2d, v26.2d, alphaV2
|
||||
fmul v3.2d, v27.2d, alphaV3
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
|
||||
fmul v1.2d, v25.2d, alphaV0
|
||||
stp q0, q1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
fmul v2.2d, v26.2d, alphaV0
|
||||
fmul v3.2d, v27.2d, alphaV0
|
||||
stp q2, q3, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
fmul v4.2d, v28.2d, alphaV0
|
||||
fmul v5.2d, v29.2d, alphaV1
|
||||
fmul v6.2d, v30.2d, alphaV2
|
||||
fmul v7.2d, v31.2d, alphaV3
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
fmul v5.2d, v29.2d, alphaV0
|
||||
stp q4, q5, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
add pCRow3, pCRow3, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
fmul v6.2d, v30.2d, alphaV0
|
||||
fmul v7.2d, v31.2d, alphaV0
|
||||
stp q6, q7, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV2
|
||||
fmul v13.2d, v21.2d, alphaV3
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
fmul v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
fmul v8.2d, v24.2d, alphaV0
|
||||
fmul v9.2d, v25.2d, alphaV1
|
||||
fmul v9.2d, v25.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v28.2d, alphaV2
|
||||
fmul v13.2d, v29.2d, alphaV3
|
||||
fmul v12.2d, v28.2d, alphaV0
|
||||
fmul v13.2d, v29.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
fmul v8.2d, v24.2d, alphaV2
|
||||
fmul v8.2d, v24.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v28.2d, alphaV3
|
||||
fmul v12.2d, v28.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
|
@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add pCRow2, pCRow1, LDC
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.d}[0], [pCRow2]
|
||||
st1 {v12.d}[1], [pCRow1]
|
||||
|
||||
|
@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0
|
||||
fmul v5.2d, v21.2d, alphaV1
|
||||
fmul v6.2d, v22.2d, alphaV2
|
||||
fmul v7.2d, v23.2d, alphaV3
|
||||
fmul v5.2d, v21.2d, alphaV0
|
||||
fmul v6.2d, v22.2d, alphaV0
|
||||
fmul v7.2d, v23.2d, alphaV0
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV2
|
||||
fmul v13.2d, v21.2d, alphaV3
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
fmul v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
|
@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
fmov alpha0, alpha
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
|
@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0, alpha
|
||||
fmul d8, d16, alpha0
|
||||
str d8, [pCRow0]
|
||||
|
||||
|
@@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]

fmov alpha0, d0
fmov alpha1, d0
fmov alpha2, d0
fmov alpha3, d0
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]

fmov alpha, d0

lsl LDC, LDC, #3 // ldc = ldc * 8

@@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/

dtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC

add pC, pCRow3, LDC
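
The new code path above derives all four output pointers once per block of four columns and advances pC through pCRow3; a small C-style sketch (illustrative only, with ldc as an element stride rather than the byte-scaled LDC used in the assembly):

    /* Hypothetical helper mirroring the pointer setup at dtrmm_kernel_L4_BEGIN. */
    static void setup_c_pointers(double **pC, long ldc, double *pCRow[4])
    {
        pCRow[0] = *pC;                 /* mov pCRow0, pC            */
        pCRow[1] = pCRow[0] + ldc;      /* add pCRow1, pCRow0, LDC   */
        pCRow[2] = pCRow[1] + ldc;      /* add pCRow2, pCRow1, LDC   */
        pCRow[3] = pCRow[2] + ldc;      /* add pCRow3, pCRow2, LDC   */
        *pC = pCRow[3] + ldc;           /* add pC, pCRow3, LDC       */
    }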
|
||||
|
||||
|
||||
#if defined(LEFT)
|
||||
mov tempOffset, offset
|
||||
|
@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble dtrmm_kernel_L4_M4_BEGIN
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_20:
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20:
add tempK, tempOffset, #4
#endif

asr counterL , tempK, #1 // L = K / 2
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32

KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2

subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
.align 5

.align 5
dtrmm_kernel_L4_M8_22:

KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22

.align 5
dtrmm_kernel_L4_M8_22a:

KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E

b dtrmm_kernel_L4_M8_44

.align 5
dtrmm_kernel_L4_M8_32:

tst counterL, #1
ble dtrmm_kernel_L4_M8_40

KERNEL8x4_I

KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E

b dtrmm_kernel_L4_M8_44

@@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40:

dtrmm_kernel_L4_M8_44:

ands counterL , tempK, #1
ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100

.align 5
dtrmm_kernel_L4_M8_46:

KERNEL8x4_SUB

subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
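
For readability, here is the rough shape of the K loop the code above now implements: the body is unrolled 8x and software-pipelined, with KERNEL8x4_I starting the pipeline, _M1/_M2 alternating between two sets of load registers, _E draining it, and KERNEL8x4_SUB taking the K % 8 remainder. The C below is illustrative only (empty stand-ins for the assembler macros; the real code also branches separately for the counterL < 2 entry cases).

    /* Empty stand-ins so the control-flow sketch compiles; each real macro
     * processes one k-step of the 8x4 tile. */
    static void KERNEL8x4_I(void)   {}
    static void KERNEL8x4_M1(void)  {}
    static void KERNEL8x4_M2(void)  {}
    static void KERNEL8x4_E(void)   {}
    static void KERNEL8x4_SUB(void) {}

    static void k_loop_shape(long tempK)
    {
        long trips = tempK >> 3;                 /* asr counterL, tempK, #3  (K / 8) */
        if (trips > 0) {
            KERNEL8x4_I();  KERNEL8x4_M2();      /* start the software pipeline */
            for (long k = 2; k + 2 < 8 * trips; k += 2) {
                KERNEL8x4_M1();                  /* consumes registers loaded by _M2 */
                KERNEL8x4_M2();                  /* consumes registers loaded by _M1 */
            }
            KERNEL8x4_M1();  KERNEL8x4_E();      /* drain the pipeline */
        }
        for (long k = 0; k < (tempK & 7); k++)   /* ands counterL, tempK, #7 */
            KERNEL8x4_SUB();                     /* remainder when K % 8 != 0 */
    }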
dtrmm_kernel_L4_M8_100:

SAVE8x4

@@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100:
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

dtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
@@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18

#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]

#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]

#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
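
Only the OP_rr line of the conjugation-dependent defines is visible in this hunk; assuming the usual pairing for the NN/NT/TN/TT case (OP_ii = fmls, OP_ri = OP_ir = fmla), the OP_* sequence used throughout this kernel amounts to one complex multiply-accumulate per element, sketched in C below (illustrative, not from the source):

    /* c += a * b for one complex element, as the four OP_* instructions compute it;
     * v16-style registers hold the real accumulators, v17-style the imaginary ones. */
    static inline void cmla(double *c_re, double *c_im,
                            double a_re, double a_im, double b_re, double b_im)
    {
        *c_re += a_re * b_re;    /* OP_rr (fmla)          */
        *c_re -= a_im * b_im;    /* OP_ii (fmls, assumed) */
        *c_im += a_re * b_im;    /* OP_ri (fmla, assumed) */
        *c_im += a_im * b_re;    /* OP_ir (fmla, assumed) */
    }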
@@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save
// 20 must save
// 21 must save
@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_I
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
|
@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
|
@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
|
@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB] // For next round
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
|
@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA] // For next round
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
|
@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
|
@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld2 {v8.2d, v9.2d}, [pB] // For next round
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
|
@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA] // For next round
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
|
@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
|
@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
|
@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
|
@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_SUB
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
|
@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
ld2 {v0.2d, v1.2d}, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
ld2 {v6.2d, v7.2d}, [pCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmla v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow2]
|
||||
fmla v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmla v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmla v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV0_R
|
||||
fmls v2.2d, v27.2d, alphaV0_I
|
||||
fmla v3.2d, v26.2d, alphaV1_I
|
||||
fmla v3.2d, v27.2d, alphaV1_R
|
||||
fmla v3.2d, v26.2d, alphaV0_I
|
||||
fmla v3.2d, v27.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
ld2 {v4.2d, v5.2d}, [pCRow3]
|
||||
fmla v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmla v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
|
||||
ld2 {v6.2d, v7.2d}, [pCRow3]
|
||||
fmla v6.2d, v30.2d, alphaV0_R
|
||||
fmls v6.2d, v31.2d, alphaV0_I
|
||||
fmla v7.2d, v30.2d, alphaV1_I
|
||||
fmla v7.2d, v31.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v7.2d, v30.2d, alphaV0_I
|
||||
fmla v7.2d, v31.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
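
The SAVE4x4 macro above loads each pair of C vectors, applies complex alpha to the accumulators, and stores the result back. Per element it performs the update sketched here (illustrative C matching the fmla/fmls pattern; alphaV0_R/alphaV0_I are the real and imaginary parts of alpha):

    /* C += alpha * acc for one complex element, as done by the SAVE macros. */
    static inline void zsave_elem(double *c_re, double *c_im,
                                  double acc_re, double acc_im,
                                  double alpha_re, double alpha_im)
    {
        *c_re += acc_re * alpha_re - acc_im * alpha_im;
        *c_im += acc_re * alpha_im + acc_im * alpha_re;
    }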
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmla v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
fmla v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmla v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
fmla v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmla d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmla d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d24, alphaV0_R
|
||||
fmls d0, d25, alphaV0_I
|
||||
fmla d1, d24, alphaV1_I
|
||||
fmla d1, d25, alphaV1_R
|
||||
fmla d1, d24, alphaV0_I
|
||||
fmla d1, d25, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d28, alphaV0_R
|
||||
fmls d4, d29, alphaV0_I
|
||||
fmla d5, d28, alphaV1_I
|
||||
fmla d5, d29, alphaV1_R
|
||||
fmla d5, d28, alphaV0_I
|
||||
fmla d5, d29, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmla v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
fmla v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmla d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmla d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha_save_R, d0
|
||||
fmov alpha_save_I, d1
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, d0
|
||||
fmov alphaI, d1
|
||||
|
||||
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
|
||||
|
||||
|
@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ble zgemm_kernel_L2_BEGIN
|
||||
|
||||
zgemm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
zgemm_kernel_L4_M4_BEGIN:
|
||||
|
@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble zgemm_kernel_L4_M2_BEGIN
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_20:
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #1 // L = K / 2
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
asr counterL , origK, #3
|
||||
cmp counterL , #2
|
||||
blt zgemm_kernel_L4_M4_32
|
||||
|
||||
KERNEL4x4_I // do one in the K
|
||||
KERNEL4x4_M2 // do another in the K
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble zgemm_kernel_L4_M4_22a
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_22:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt zgemm_kernel_L4_M4_22
|
||||
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_22a:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b zgemm_kernel_L4_M4_44
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble zgemm_kernel_L4_M4_40
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b zgemm_kernel_L4_M4_44
|
||||
|
@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40:
|
|||
|
||||
zgemm_kernel_L4_M4_44:
|
||||
|
||||
ands counterL , origK, #1
|
||||
ands counterL , origK, #7
|
||||
ble zgemm_kernel_L4_M4_100
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_46:
|
||||
KERNEL4x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne zgemm_kernel_L4_M4_46
|
||||
|
||||
zgemm_kernel_L4_M4_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVE4x4
|
||||
|
||||
|
|
|
@@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define temp x19
#define tempOffset x20
#define tempK x21

#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]

#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]

#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 07 offset -> temp
// 08 counterL
// 09 counterI
// 10 counterJ

@@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save temp
// 19 must save tempOffset
// 20 must save tempK
// 21 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save temp
// 20 must save tempOffset
// 21 must save tempK
// 22 must save
// 23 must save
// 24 must save
@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_I
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
|
@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
|
@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]

ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32

fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]

ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32

fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]

prfm PLDL1KEEP, [pA, #A_PRE_SIZE]

fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]

ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm

.macro KERNEL4x4_M1
@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]

ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32

OP_rr v18.2d, v2.2d, v8.d[0]
@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]

ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32

OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]

ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32

OP_rr v22.2d, v2.2d, v8.d[1]
@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]

ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32

OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]

prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]

OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]

prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]

OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]

ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32

OP_rr v18.2d, v6.2d, v12.d[0]
@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]

ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32

OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]

ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32

OP_rr v22.2d, v6.2d, v12.d[1]
@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]

ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32

OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]

prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]

OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]

prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]

OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]

OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]

prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]

OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32

ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32

OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]

OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32

OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]

ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32

OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]

OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]

prfm PLDL1KEEP, [pA, #A_PRE_SIZE]

OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE4x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
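// note: in the store sequences below, each fmul/fmls pair forms the real part (Re*alphaR - Im*alphaI)
// and each fmul/fmla pair the imaginary part (Re*alphaI + Im*alphaR) of alpha * AB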

fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]

add pCRow0, pCRow0, #32

fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]

add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32

add pCRow1, pCRow1, #32

fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]

add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]

add pCRow2, pCRow2, #32

fmul v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmul v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmul v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]

add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]

fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]

add pCRow3, pCRow3, #32

fmul v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmul v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]

add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm

/******************************************************************************/
@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0

fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]

add pCRow1, pCRow1, LDC

fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]

add pCRow1, pCRow1, LDC

fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]

add pCRow1, pCRow1, LDC

fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]

add pCRow0, pCRow0, #32
@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0

fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]

add pCRow1, pCRow1, LDC

fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]

add pCRow1, pCRow1, LDC

fmul d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmul d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmul d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]

add pCRow1, pCRow1, LDC

fmul d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmul d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmul d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]

add pCRow0, pCRow0, #16
@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0

fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]

add pCRow1, pCRow1, LDC

fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]

add pCRow0, pCRow0, #64
@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0

fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]

add pCRow1, pCRow1, LDC

fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]

add pCRow0, pCRow0, #32
@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0

fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]

add pCRow1, pCRow1, LDC

fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]

add pCRow0, pCRow0, #16
@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE4x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0

fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]

add pCRow0, pCRow0, #64
@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE2x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0

fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]

add pCRow0, pCRow0, #32
@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro SAVE1x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI

mov pCRow1, pCRow0

fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]

add pCRow0, pCRow0, #16
@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]

fmov alpha_save_R, d0
fmov alpha_save_I, d1
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]

fmov alphaR, d0
fmov alphaI, d1

lsl LDC, LDC, #4 // ldc = ldc * 2 * 8

@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble ztrmm_kernel_L2_BEGIN

ztrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC

add pC, pCRow3, LDC
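// pCRow0..pCRow3 now address four consecutive columns of C (spaced LDC apart); pC points at the next 4-column panel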


#if defined(LEFT)
mov tempOffset, offset
@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN

.align 5
ztrmm_kernel_L4_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20:
add tempK, tempOffset, #4
#endif

asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
asr counterL , tempK, #3
cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
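// counterL = K / 8: each pass of the pipelined loop below retires eight KERNEL4x4 updates
// (KERNEL4x4_I primes the pipeline, _M1/_M2 alternate register sets, _E drains it)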

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
.align 5

.align 5
ztrmm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22


.align 5
ztrmm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E

b ztrmm_kernel_L4_M4_44

.align 5
ztrmm_kernel_L4_M4_32:

tst counterL, #1
ble ztrmm_kernel_L4_M4_40

KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E

b ztrmm_kernel_L4_M4_44

@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40:

ztrmm_kernel_L4_M4_44:

ands counterL , tempK, #1
ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
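// counterL = K & 7: any leftover updates are handled one at a time by KERNEL4x4_SUB below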

.align 5
ztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB

subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46

ztrmm_kernel_L4_M4_100:

SAVE4x4
@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif

prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

ztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20

param.h
@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL

#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4

#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4

#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4

#define ZGEMM_DEFAULT_UNROLL_M 4