Improvements to TRMM and GEMM kernels

commit 0a5ff9f9f9
parent 8a40f1355e
Author: Ashwin Sekhar T K
Date:   2016-07-14 13:51:17 +05:30

8 changed files with 2396 additions and 1829 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define temp x16
#define tempOffset x17
#define tempK x18
#define pCRow3 x15
#define pA x16
#define alpha x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define alpha0 d10
#define alphaV0 v10.d[0]
#define alpha1 d11
#define alphaV1 v11.d[0]
#define alpha2 d14
#define alphaV2 v14.d[0]
#define alpha3 d15
#define alphaV3 v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
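The three *_PRE_SIZE constants are byte distances for the prfm prefetches issued inside the kernels: the kernel keeps touching cache lines a fixed distance ahead of the streaming A/B pointers. A minimal C sketch of the same idea, assuming GCC/Clang builtins; the loop body and strides are placeholders, not the kernel's:

    static void prefetch_ahead(const double *pA, const double *pB, long k)
    {
        enum { A_PRE = 2560, B_PRE = 448 };          /* bytes, as above */
        for (long i = 0; i < k; i++) {
            /* PLDL1KEEP: fetch into L1, expect reuse */
            __builtin_prefetch((const char *)pA + A_PRE, 0, 3);
            __builtin_prefetch((const char *)pB + B_PRE, 0, 3);
            /* ... multiply-accumulate on pA/pB would go here ... */
            pA += 8;   /* the 8x4 double kernel consumes 8 doubles of A   */
            pB += 4;   /* and 4 doubles of B per k iteration (assumption) */
        }
    }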
// 00 origM
// 01 origN
@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
//v08 must save pB0_0, pB0_1
//v09 must save pB0_2, pB0_3
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB1_0, pB1_1
//v13 must save pB1_2, pB1_3
//v14 must save ALPHA2
//v15 must save ALPHA3
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0
//v11 must save pB0_3
//v12 must save pB1_0
//v13 must save pB1_1
//v14 must save pB1_2
//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmul v16.2d, v0.2d, v8.d[0]
fmul v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmul v17.2d, v1.2d, v8.d[0]
fmul v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmul v24.2d, v0.2d, v10.d[0]
fmul v28.2d, v0.2d, v11.d[0]
ldp q4, q5, [pA], #32
fmul v25.2d, v1.2d, v10.d[0]
fmul v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmul v18.2d, v2.2d, v8.d[0]
fmul v22.2d, v2.2d, v9.d[0]
ldp d14, d15, [pB], #16
fmul v26.2d, v2.2d, v10.d[0]
fmul v30.2d, v2.2d, v11.d[0]
ldp q6, q7, [pA], #32
fmul v19.2d, v3.2d, v8.d[0]
fmul v27.2d, v3.2d, v10.d[0]
fmul v20.2d, v0.2d, v8.d[1]
fmul v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.d[1]
fmul v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v24.2d, v0.2d, v9.d[0]
fmul v25.2d, v1.2d, v9.d[0]
fmul v26.2d, v2.2d, v9.d[0]
fmul v27.2d, v3.2d, v9.d[0]
fmul v31.2d, v3.2d, v11.d[0]
fmul v23.2d, v3.2d, v9.d[0]
fmul v28.2d, v0.2d, v9.d[1]
fmul v29.2d, v1.2d, v9.d[1]
fmul v30.2d, v2.2d, v9.d[1]
fmul v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
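KERNEL8x4_I primes the pipeline, _M1/_M2 alternate between two register sets (v0-v3/v8-v11 versus v4-v7/v12-v15) so that the loads feeding the next k-step overlap the FMAs of the current one, and _E drains without issuing further loads. A rough C model of that ping-pong, under the assumption that names and strides are illustrative and k is even:

    static void fma8x4(double acc[8][4], const double *a, const double *b)
    {
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 4; j++)
                acc[i][j] += a[i] * b[j];
    }

    static void dgemm_8x4_pipelined(double acc[8][4],
                                    const double *pA, const double *pB, long k)
    {
        const double *a0 = pA,     *b0 = pB;       /* buffer 0 (_I primes it) */
        const double *a1 = pA + 8, *b1 = pB + 4;   /* buffer 1 */
        for (long i = 0; i < k - 2; i += 2) {
            fma8x4(acc, a0, b0); a0 += 16; b0 += 8;  /* _M1: use 0, advance */
            fma8x4(acc, a1, b1); a1 += 16; b1 += 8;  /* _M2: use 1, advance */
        }
        fma8x4(acc, a0, b0);   /* last _M1 */
        fma8x4(acc, a1, b1);   /* _E: drain, no further loads */
    }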
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp q4, q5, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v21.2d, v1.2d, v9.d[0]
fmla v29.2d, v1.2d, v11.d[0]
ldp d14, d15, [pB], #16
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
ldp q6, q7, [pA], #32
fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #512]
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
ldp q0, q1, [pA], #32
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
ldp d8, d9, [pB], #16
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
ldp d10, d11, [pB], #16
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.d[1]
ldp q2, q3, [pA], #32
fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.d[1]
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pB, #512]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.d[1]
fmla v23.2d, v7.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
fmla v25.2d, v1.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v19.2d, v3.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.d[1]
fmla v31.2d, v3.2d, v11.d[0]
fmla v23.2d, v3.2d, v9.d[0]
.endm
.macro SAVE8x4
add pCRow1, pCRow0, LDC
fmov alpha0, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v1.2d, v17.2d, alphaV0
stp q0, q1, [pCRow0]
add pCRow2, pCRow1, LDC
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV1
fmul v6.2d, v22.2d, alphaV2
fmul v7.2d, v23.2d, alphaV3
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmul v5.2d, v21.2d, alphaV0
stp q4, q5, [pCRow1]
add pCRow1, pCRow2, LDC
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v0.2d, v24.2d, alphaV0
fmul v1.2d, v25.2d, alphaV1
fmul v2.2d, v26.2d, alphaV2
fmul v3.2d, v27.2d, alphaV3
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
fmul v1.2d, v25.2d, alphaV0
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v2.2d, v26.2d, alphaV0
fmul v3.2d, v27.2d, alphaV0
stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0
fmul v5.2d, v29.2d, alphaV1
fmul v6.2d, v30.2d, alphaV2
fmul v7.2d, v31.2d, alphaV3
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmul v5.2d, v29.2d, alphaV0
stp q4, q5, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v6.2d, v30.2d, alphaV0
fmul v7.2d, v31.2d, alphaV0
stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32
.endm
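Note that the TRMM save overwrites C rather than accumulating into it: each accumulator pair is scaled by alpha with fmul and stored, without loading C first, while the stp/prfm interleaving hides store latency. What SAVE8x4 computes, as a hedged C sketch (column-major C, with pCRow0..pCRow3 one LDC apart, i.e. one column apart; the acc layout is illustrative):

    static void save8x4(double *c, long ldc, double alpha,
                        const double acc[8][4])
    {
        /* one pCRow per column; each column receives 8 scaled doubles */
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 8; i++)
                c[(long)j * ldc + i] = alpha * acc[i][j];
    }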
/******************************************************************************/
@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV2
fmul v13.2d, v21.2d, alphaV3
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV0
fmul v9.2d, v25.2d, alphaV1
fmul v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV2
fmul v13.2d, v29.2d, alphaV3
fmul v12.2d, v28.2d, alphaV0
fmul v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV2
fmul v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV3
fmul v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV1
fmul v6.2d, v22.2d, alphaV2
fmul v7.2d, v23.2d, alphaV3
fmul v5.2d, v21.2d, alphaV0
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV2
fmul v13.2d, v21.2d, alphaV3
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1 , pCRow0, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0, alpha
add pCRow1 , pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
fmov alpha0, alpha
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64
@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0, alpha
fmul d8, d16, alpha0
str d8, [pCRow0]
@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha0, d0
fmov alpha1, d0
fmov alpha2, d0
fmov alpha3, d0
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8
@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
#if defined(LEFT)
mov tempOffset, offset
@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
.align 5
dtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20:
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #1 // L = K / 2
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
.align 5
dtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
.align 5
dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
.align 5
dtrmm_kernel_L4_M8_32:
tst counterL, #1
ble dtrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40:
dtrmm_kernel_L4_M8_44:
ands counterL , tempK, #1
ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
.align 5
dtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
dtrmm_kernel_L4_M8_100:
SAVE8x4
@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100:
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
dtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
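The control-flow change above replaces the old unroll-by-2 loop (asr counterL, tempK, #1) with an unroll-by-8 main loop, and the leftover 0..7 iterations are picked up one at a time by the ands counterL, tempK, #7 tail running KERNEL8x4_SUB. Schematically, in C with stub names (my reading of the diff, not OpenBLAS API):

    static void kernel_step8(void) { /* 4x (KERNEL8x4_M1 + _M2), pipelined */ }
    static void kernel_step1(void) { /* one KERNEL8x4_SUB */ }

    static void k_loop(long k)
    {
        long unrolled = k >> 3;            /* asr  counterL, tempK, #3 */
        long tail     = k &  7;            /* ands counterL, tempK, #7 */
        for (long i = 0; i < unrolled; i++)
            kernel_step8();
        for (long i = 0; i < tail; i++)
            kernel_step1();
    }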

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save
// 20 must save
// 21 must save
@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
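The OP_rr/OP_ii/OP_ri/OP_ir macros assemble a complex multiply-accumulate from four real FMAs, with the conjugation cases (#if defined(NR) || ...) flipping signs of individual terms. For the plain NN/NT/TN/TT mapping defined near the top of the file, the arithmetic is, in C (a sketch of this reading, not OpenBLAS API):

    static void cmla(double *acc_re, double *acc_im,
                     double a_re, double a_im, double b_re, double b_im)
    {
        *acc_re += a_re * b_re;   /* OP_rr -> fmla */
        *acc_re -= a_im * b_im;   /* OP_ii -> fmls */
        *acc_im += a_re * b_im;   /* OP_ri -> fmla */
        *acc_im += a_im * b_re;   /* OP_ir -> fmla */
    }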
.macro SAVE4x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2 {v0.2d, v1.2d}, [pCRow1]
ld2 {v0.2d, v1.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]
add pCRow0, pCRow0, #32
ld2 {v2.2d, v3.2d}, [pCRow0]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
add pCRow1, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow1]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmla v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmla v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
ld2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ld2 {v0.2d, v1.2d}, [pCRow2]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmla v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmla v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]
add pCRow2, pCRow2, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmla v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmla v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ld2 {v4.2d, v5.2d}, [pCRow1]
ld2 {v4.2d, v5.2d}, [pCRow3]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmla v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]
add pCRow3, pCRow3, #32
ld2 {v6.2d, v7.2d}, [pCRow3]
fmla v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmla v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmla v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmla v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmla v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmla v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmla v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmla d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmla d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmla d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmla d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmla d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmla d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmla v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmla v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmla d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmla d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha_save_R, d0
fmov alpha_save_I, d1
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble zgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
.align 5
zgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
.align 5
zgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
.align 5
zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
.align 5
zgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40:
zgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
.align 5
zgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4
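A recurring change in this commit is visible above: the four C column pointers are now derived once per 4-wide panel (pCRow1..pCRow3 at LDC steps from pCRow0), and pC advances a whole panel at a time, replacing the old add pC, pC, LDC, lsl #2. The same bookkeeping in C, in element units rather than the byte-scaled LDC of the asm:

    static double *next_panel(double *pC, long ldc, double *col[4])
    {
        col[0] = pC;                 /* pCRow0 */
        col[1] = col[0] + ldc;       /* pCRow1 */
        col[2] = col[1] + ldc;       /* pCRow2 */
        col[3] = col[2] + ldc;       /* pCRow3 */
        return col[3] + ldc;         /* new pC: four columns ahead */
    }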


@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define temp x19
#define tempOffset x20
#define tempK x21
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 07 offset -> temp
// 08 counterL
// 09 counterI
// 10 counterJ
@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save temp
// 19 must save tempOffset
// 20 must save tempK
// 21 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save temp
// 20 must save tempOffset
// 21 must save tempK
// 22 must save
// 23 must save
// 24 must save
@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]
add pCRow0, pCRow0, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
add pCRow1, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]
add pCRow2, pCRow2, #32
fmul v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmul v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmul v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]
add pCRow3, pCRow3, #32
fmul v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmul v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmul d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmul d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmul d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmul d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha_save_R, d0
fmov alpha_save_I, d1
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble ztrmm_kernel_L2_BEGIN
ztrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
#if defined(LEFT)
mov tempOffset, offset
@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
.align 5
ztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20:
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
asr counterL , tempK, #3
cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
.align 5
ztrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
.align 5
ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
.align 5
ztrmm_kernel_L4_M4_32:
tst counterL, #1
ble ztrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40:
ztrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
.align 5
ztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
ztrmm_kernel_L4_M4_100:
SAVE4x4
@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
ztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20


@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
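These parameter bumps (UNROLL_M going 4 -> 16 for SGEMM, 4 -> 8 for DGEMM and CGEMM) tell the level-3 driver to carve C into the wider micro-tiles that the new kernels handle. Schematically, under the assumption of a standard tiled driver (names are illustrative):

    static void dgemm_kernel_8x4(long i, long j) { /* micro-tile stub */ }

    static void drive_tiles(long m, long n)
    {
        enum { UNROLL_M = 8, UNROLL_N = 4 };   /* DGEMM defaults above */
        for (long j = 0; j + UNROLL_N <= n; j += UNROLL_N)
            for (long i = 0; i + UNROLL_M <= m; i += UNROLL_M)
                dgemm_kernel_8x4(i, j);
        /* leftover rows/columns fall back to the narrower kernels
         * (4x4, 2x4, 1x4, ...) defined alongside the 8x4 one */
    }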