Cortex-A57: Improve DGEMM 8x4 Implementation

Ashwin Sekhar T K 2016-03-17 10:23:51 +05:30
parent 8519e4ed9f
commit 3b5ffb49d3
1 changed file with 206 additions and 174 deletions


@@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10
#define alphaV0 v10.d[0]
#define alpha1 d11
#define alphaV1 v11.d[0]
#define alpha2 d14
#define alphaV2 v14.d[0]
#define alpha3 d15
#define alphaV3 v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
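The three *_PRE_SIZE constants introduced here name the prefetch distances (in bytes) that were previously hard-coded literals such as #224 and #640 in the macro bodies. A minimal C sketch of the same idea, assuming GCC/Clang's __builtin_prefetch; the byte distances are the Cortex-A57 tuning values taken from this diff, not derived here:

/* Prefetch distances in bytes, mirroring the assembly constants. */
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448

/* Touch cache lines well ahead of the current read position so the
 * loads of later iterations hit L1 (PLDL1KEEP in the assembly). */
static inline void prefetch_panels(const double *pA, const double *pB)
{
    __builtin_prefetch((const char *)pA + A_PRE_SIZE, 0, 3);
    __builtin_prefetch((const char *)pA + A_PRE_SIZE + 64, 0, 3); /* next 64-byte line */
    __builtin_prefetch((const char *)pB + B_PRE_SIZE, 0, 3);
}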
// 00 origM
// 01 origN
@@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16
// 15 pCRow3
// 16 pA
// 17
// 18 must save
// 19 must save
@@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
//v08 must save pB0_0, pB0_1
//v09 must save pB0_2, pB0_3
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB1_0, pB1_1
//v13 must save pB1_2, pB1_3
//v14 must save ALPHA2
//v15 must save ALPHA3
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0
//v11 must save pB0_3
//v12 must save pB1_0
//v13 must save pB1_1
//v14 must save pB1_2
//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
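The map above dedicates v16-v31 to sixteen 128-bit accumulators holding an 8x4 block of C (two doubles each), with v0-v7 staging eight A elements and, after this change, v8-v15 holding one B scalar each for the two pipelined iterations instead of packing two per register. For reference, a plain C sketch of the computation the fmul/fmla sequences below implement (names are illustrative, not from the source):

/* Reference 8x4 DGEMM inner kernel: acc[i][j] accumulates the dot
 * product of row i of the packed A panel with column j of the packed
 * B panel; v16-v31 hold acc in the assembly version. */
static void dgemm_ref_8x4(long K, const double *A, const double *B,
                          double acc[8][4])
{
    for (long k = 0; k < K; k++)
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 4; j++)
                acc[i][j] += A[8 * k + i] * B[4 * k + j];
}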
@@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp d8, d9, [pB]
add pB, pB, #16
ldp d10, d11, [pB]
add pB, pB, #16
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmul v16.2d, v0.2d, v8.2d[0]
fmul v17.2d, v1.2d, v8.2d[0]
fmul v18.2d, v2.2d, v8.2d[0]
fmul v19.2d, v3.2d, v8.2d[0]
fmul v20.2d, v0.2d, v9.2d[0]
ldp d10, d11, [pB], #16
fmul v17.2d, v1.2d, v8.2d[0]
fmul v21.2d, v1.2d, v9.2d[0]
fmul v22.2d, v2.2d, v9.2d[0]
fmul v23.2d, v3.2d, v9.2d[0]
ldp q2, q3, [pA], #32
fmul v24.2d, v0.2d, v10.2d[0]
fmul v25.2d, v1.2d, v10.2d[0]
fmul v26.2d, v2.2d, v10.2d[0]
fmul v27.2d, v3.2d, v10.2d[0]
fmul v28.2d, v0.2d, v11.2d[0]
ldp q4, q5, [pA], #32
fmul v25.2d, v1.2d, v10.2d[0]
fmul v29.2d, v1.2d, v11.2d[0]
fmul v30.2d, v2.2d, v11.2d[0]
fmul v31.2d, v3.2d, v11.2d[0]
ldp d12, d13, [pB], #16
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
ldp d12, d13, [pB]
add pB, pB, #16
ldp d14, d15, [pB]
add pB, pB, #16
fmul v18.2d, v2.2d, v8.2d[0]
fmul v22.2d, v2.2d, v9.2d[0]
ldp d14, d15, [pB], #16
fmul v26.2d, v2.2d, v10.2d[0]
fmul v30.2d, v2.2d, v11.2d[0]
ldp q6, q7, [pA], #32
fmul v19.2d, v3.2d, v8.2d[0]
fmul v27.2d, v3.2d, v10.2d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v31.2d, v3.2d, v11.2d[0]
fmul v23.2d, v3.2d, v9.2d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
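KERNEL8x4_I is the pipeline prologue: it uses fmul rather than fmla, so the first k step both initializes and fills the sixteen accumulators with no separate zeroing pass. A sketch of the distinction (illustrative names):

/* First k step writes the accumulators directly (fmul). */
static void kernel8x4_init(const double a[8], const double b[4],
                           double acc[8][4])
{
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 4; j++)
            acc[i][j] = a[i] * b[j];  /* later steps: acc[i][j] += a[i] * b[j] (fmla) */
}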
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0]
fmla v21.2d, v1.2d, v9.2d[0]
fmla v26.2d, v2.2d, v10.2d[0]
fmla v31.2d, v3.2d, v11.2d[0]
ld1 {v4.2d}, [pA], #16
fmla v20.2d, v0.2d, v9.2d[0]
fmla v17.2d, v1.2d, v8.2d[0]
ld1 {v5.2d}, [pA], #16
fmla v30.2d, v2.2d, v11.2d[0]
fmla v27.2d, v3.2d, v10.2d[0]
ldp d12, d13, [pB]
add pB, pB, #16
fmla v28.2d, v0.2d, v11.2d[0]
fmla v25.2d, v1.2d, v10.2d[0]
ldp d14, d15, [pB]
add pB, pB, #16
fmla v18.2d, v2.2d, v8.2d[0]
fmla v23.2d, v3.2d, v9.2d[0]
ld1 {v6.2d}, [pA], #16
ldp q4, q5, [pA], #32
fmla v24.2d, v0.2d, v10.2d[0]
fmla v28.2d, v0.2d, v11.2d[0]
ldp d12, d13, [pB], #16
fmla v17.2d, v1.2d, v8.2d[0]
fmla v25.2d, v1.2d, v10.2d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v21.2d, v1.2d, v9.2d[0]
fmla v29.2d, v1.2d, v11.2d[0]
ld1 {v7.2d}, [pA], #16
ldp d14, d15, [pB], #16
fmla v18.2d, v2.2d, v8.2d[0]
fmla v22.2d, v2.2d, v9.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
prfm PLDL1KEEP, [pA, #224]
prfm PLDL1KEEP, [pA, #224+64]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v26.2d, v2.2d, v10.2d[0]
fmla v30.2d, v2.2d, v11.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
fmla v23.2d, v3.2d, v9.2d[0]
ldp q6, q7, [pA], #32
fmla v27.2d, v3.2d, v10.2d[0]
fmla v31.2d, v3.2d, v11.2d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0]
fmla v21.2d, v5.2d, v13.2d[0]
fmla v26.2d, v6.2d, v14.2d[0]
fmla v31.2d, v7.2d, v15.2d[0]
ld1 {v0.2d}, [pA], #16
fmla v20.2d, v4.2d, v13.2d[0]
fmla v17.2d, v5.2d, v12.2d[0]
ld1 {v1.2d}, [pA], #16
fmla v30.2d, v6.2d, v15.2d[0]
fmla v27.2d, v7.2d, v14.2d[0]
ldp d8, d9, [pB]
add pB, pB, #16
fmla v24.2d, v4.2d, v14.2d[0]
fmla v28.2d, v4.2d, v15.2d[0]
ldp q0, q1, [pA], #32
fmla v17.2d, v5.2d, v12.2d[0]
fmla v25.2d, v5.2d, v14.2d[0]
ldp d10, d11, [pB]
add pB, pB, #16
ldp d8, d9, [pB], #16
fmla v22.2d, v6.2d, v13.2d[0]
fmla v19.2d, v7.2d, v12.2d[0]
ld1 {v2.2d}, [pA], #16
fmla v24.2d, v4.2d, v14.2d[0]
fmla v21.2d, v5.2d, v13.2d[0]
fmla v29.2d, v5.2d, v15.2d[0]
ld1 {v3.2d}, [pA], #16
ldp d10, d11, [pB], #16
fmla v18.2d, v6.2d, v12.2d[0]
fmla v22.2d, v6.2d, v13.2d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v26.2d, v6.2d, v14.2d[0]
fmla v30.2d, v6.2d, v15.2d[0]
fmla v19.2d, v7.2d, v12.2d[0]
fmla v23.2d, v7.2d, v13.2d[0]
prfm PLDL1KEEP, [pB, #640]
ldp q2, q3, [pA], #32
fmla v27.2d, v7.2d, v14.2d[0]
fmla v31.2d, v7.2d, v15.2d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0]
fmla v17.2d, v5.2d, v12.2d[0]
fmla v18.2d, v6.2d, v12.2d[0]
fmla v19.2d, v7.2d, v12.2d[0]
fmla v20.2d, v4.2d, v13.2d[0]
fmla v21.2d, v5.2d, v13.2d[0]
fmla v22.2d, v6.2d, v13.2d[0]
fmla v23.2d, v7.2d, v13.2d[0]
fmla v24.2d, v4.2d, v14.2d[0]
fmla v25.2d, v5.2d, v14.2d[0]
fmla v26.2d, v6.2d, v14.2d[0]
fmla v27.2d, v7.2d, v14.2d[0]
fmla v28.2d, v4.2d, v15.2d[0]
fmla v17.2d, v5.2d, v12.2d[0]
fmla v25.2d, v5.2d, v14.2d[0]
fmla v21.2d, v5.2d, v13.2d[0]
fmla v29.2d, v5.2d, v15.2d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v18.2d, v6.2d, v12.2d[0]
fmla v22.2d, v6.2d, v13.2d[0]
fmla v26.2d, v6.2d, v14.2d[0]
fmla v30.2d, v6.2d, v15.2d[0]
fmla v19.2d, v7.2d, v12.2d[0]
fmla v23.2d, v7.2d, v13.2d[0]
fmla v27.2d, v7.2d, v14.2d[0]
fmla v31.2d, v7.2d, v15.2d[0]
.endm
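Together the four macros implement a software-pipelined k loop: _M1 computes from v0-v3/v8-v11 while reloading v4-v7/v12-v15, _M2 does the converse, _I is the prologue that fills both buffers, and _E drains the pipeline without issuing loads. A C-level sketch of the schedule, with hypothetical stub names standing in for the macros:

/* Stubs standing in for the KERNEL8x4_* macros (hypothetical names). */
static void kernel_i(void)  {}  /* load both operand buffers, fmul into acc   */
static void kernel_m1(void) {}  /* fmla from buffer 0 while reloading buffer 1 */
static void kernel_m2(void) {}  /* fmla from buffer 1 while reloading buffer 0 */
static void kernel_e(void)  {}  /* fmla from buffer 1, no loads: drain         */

static void k_loop(long steps)  /* steps even and >= 4 */
{
    kernel_i();
    kernel_m2();
    for (long l = steps - 4; l > 0; l -= 2) {
        kernel_m1();
        kernel_m2();
    }
    kernel_m1();
    kernel_e();
}

This overlap lets the loads for step k+1 proceed while the FMAs for step k occupy the Cortex-A57 FP pipes.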
.macro KERNEL8x4_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp d8, d9, [pB]
add pB, pB, #16
ldp d10, d11, [pB]
add pB, pB, #16
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmla v16.2d, v0.2d, v8.2d[0]
fmla v17.2d, v1.2d, v8.2d[0]
fmla v18.2d, v2.2d, v8.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
fmla v20.2d, v0.2d, v9.2d[0]
ldp d10, d11, [pB], #16
fmla v17.2d, v1.2d, v8.2d[0]
fmla v21.2d, v1.2d, v9.2d[0]
fmla v22.2d, v2.2d, v9.2d[0]
fmla v23.2d, v3.2d, v9.2d[0]
ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v10.2d[0]
fmla v28.2d, v0.2d, v11.2d[0]
fmla v25.2d, v1.2d, v10.2d[0]
fmla v29.2d, v1.2d, v11.2d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.2d[0]
fmla v22.2d, v2.2d, v9.2d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v26.2d, v2.2d, v10.2d[0]
fmla v30.2d, v2.2d, v11.2d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v19.2d, v3.2d, v8.2d[0]
fmla v27.2d, v3.2d, v10.2d[0]
fmla v28.2d, v0.2d, v11.2d[0]
fmla v29.2d, v1.2d, v11.2d[0]
fmla v30.2d, v2.2d, v11.2d[0]
fmla v31.2d, v3.2d, v11.2d[0]
fmla v23.2d, v3.2d, v9.2d[0]
.endm
.macro SAVE8x4
fmov alpha0, alpha
ld1 {v0.2d, v1.2d}, [pCRow0]
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ldp q0, q1, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV0
st1 {v0.2d, v1.2d}, [pCRow0]
stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld1 {v2.2d, v3.2d}, [pCRow0]
ldp q2, q3, [pCRow0]
fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV0
st1 {v2.2d, v3.2d}, [pCRow0]
stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32
ld1 {v4.2d, v5.2d}, [pCRow1]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ldp q4, q5, [pCRow1]
fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV0
st1 {v4.2d, v5.2d}, [pCRow1]
stp q4, q5, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1 {v6.2d, v7.2d}, [pCRow1]
ldp q6, q7, [pCRow1]
fmla v6.2d, v22.2d, alphaV0
fmla v7.2d, v23.2d, alphaV0
st1 {v6.2d, v7.2d}, [pCRow1]
stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32
ld1 {v0.2d, v1.2d}, [pCRow2]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q0, q1, [pCRow2]
fmla v0.2d, v24.2d, alphaV0
fmla v1.2d, v25.2d, alphaV0
st1 {v0.2d, v1.2d}, [pCRow2]
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
ld1 {v2.2d, v3.2d}, [pCRow2]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q2, q3, [pCRow2]
fmla v2.2d, v26.2d, alphaV0
fmla v3.2d, v27.2d, alphaV0
st1 {v2.2d, v3.2d}, [pCRow2]
stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32
ld1 {v4.2d, v5.2d}, [pCRow3]
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ldp q4, q5, [pCRow3]
fmla v4.2d, v28.2d, alphaV0
fmla v5.2d, v29.2d, alphaV0
st1 {v4.2d, v5.2d}, [pCRow3]
stp q4, q5, [pCRow3]
add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ld1 {v6.2d, v7.2d}, [pCRow3]
ldp q6, q7, [pCRow3]
fmla v6.2d, v30.2d, alphaV0
fmla v7.2d, v31.2d, alphaV0
st1 {v6.2d, v7.2d}, [pCRow3]
stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow0, #128]
prfm PLDL2KEEP, [pCRow1, #128]
prfm PLDL2KEEP, [pCRow2, #128]
prfm PLDL2KEEP, [pCRow3, #128]
.endm
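Because v11, v14 and v15 now carry B operands during the k loop, alpha is stashed in a separate register (fmov alpha, d0) and moved back into d10 (fmov alpha0, alpha) only at save time, which is why the alphaV1-alphaV3 aliases disappear throughout this diff. SAVE8x4 itself performs the update below, interleaved with PLDL2KEEP prefetches of the next C tile (a C sketch; col[] stands for pCRow0-pCRow3):

/* Write-back for one 8x4 tile: C := C + alpha * acc, column by column. */
static void save_8x4(double *col[4], double alpha, const double acc[8][4])
{
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 8; i++)
            col[j][i] += alpha * acc[i][j];
}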
/******************************************************************************/
@@ -422,30 +433,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV2
fmla v13.2d, v21.2d, alphaV3
fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
fmla v9.2d, v25.2d, alphaV1
fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v28.2d, alphaV2
fmla v13.2d, v29.2d, alphaV3
fmla v12.2d, v28.2d, alphaV0
fmla v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -474,6 +486,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1, pCRow0, LDC
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV1
fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
ld1 {v8.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV2
fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v28.2d, alphaV3
fmla v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
@@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v12.d}[0], [pCRow2]
ld1 {v12.d}[1], [pCRow1]
fmla v12.2d, v20.2d, alphaV1
fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@@ -571,20 +585,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV1
fmla v2.2d, v18.2d, alphaV2
fmla v3.2d, v19.2d, alphaV3
fmla v1.2d, v17.2d, alphaV0
fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV1
fmla v6.2d, v22.2d, alphaV2
fmla v7.2d, v23.2d, alphaV3
fmla v5.2d, v21.2d, alphaV0
fmla v6.2d, v22.2d, alphaV0
fmla v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
@@ -612,16 +627,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV2
fmla v13.2d, v21.2d, alphaV3
fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -646,6 +662,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1 , pCRow0, LDC
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV1
fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -676,6 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0, alpha
add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
@@ -713,11 +731,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
fmov alpha0, alpha
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV1
fmla v2.2d, v18.2d, alphaV2
fmla v3.2d, v19.2d, alphaV3
fmla v1.2d, v17.2d, alphaV0
fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64
@@ -743,9 +762,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@@ -769,6 +789,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0, alpha
ldr d8, [pCRow0]
fmadd d8, d16, alpha0, d8
str d8, [pCRow0]
@@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8: element stride to byte stride (sizeof(double) == 8)
@@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN:
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array
@@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
mov pB, origPB
@@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20:
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
KERNEL8x4_M1
@@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22:
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
@@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a:
b dgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
tst counterL, #1
@@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44:
ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
@@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46:
bne dgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
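At dgemm_kernel_L4_M8_100 the new code also prefetches the start of the current A panel and of origPB before the save, so the first loads of the next iteration are already in flight when the k loop restarts. Overall, K is split between the pipelined path and a scalar tail handled by KERNEL8x4_SUB; a sketch of the dispatch, where the mask origK & 7 mirrors the visible "ands counterL, origK, #7" and the shift by 3 is an assumption about the elided counter setup (stub name hypothetical):

static void kernel8x4_sub(void) {}  /* KERNEL8x4_SUB: one k step, loads + fmla */

static void k_dispatch(long origK)
{
    long blocks = origK >> 3;       /* assumed: counterL counts groups of 8 k steps */
    if (blocks > 0) {
        /* pipelined path: I, M2, (M1, M2)..., M1, E */
    }
    for (long tail = origK & 7; tail > 0; tail--)
        kernel8x4_sub();            /* dgemm_kernel_L4_M8_46 loop */
}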