commit e17303933a

Merge pull request #802 from ashwinyes/develop_20160314_dgemm_optimization

DGEMM Optimizations for Cortex-A57
@@ -141,5 +141,11 @@ In chronological order:
 * Martin Koehler <https://github.com/grisuthedragon/>
   * [2015-09-07] Improved imatcopy
 
+* Ashwin Sekhar T K <https://github.com/ashwinyes/>
+  * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
+  * [2015-11-20] lapack-test fixes for Cortex-A57
+  * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
+  * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
+
 * [Your name or handle] <[email or website]>
   * [Date] [Brief summary of your changes]
@@ -60,32 +60,55 @@ DGEMVTKERNEL = gemv_t.S
 CGEMVTKERNEL = zgemv_t.S
 ZGEMVTKERNEL = zgemv_t.S
 
-STRMMKERNEL = strmm_kernel_4x4.S
-DTRMMKERNEL = dtrmm_kernel_4x4.S
-CTRMMKERNEL = ctrmm_kernel_4x4.S
-ZTRMMKERNEL = ztrmm_kernel_4x4.S
-
-SGEMMKERNEL = sgemm_kernel_4x4.S
-SGEMMONCOPY = ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
+endif
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
 SGEMMONCOPYOBJ = sgemm_oncopy.o
 SGEMMOTCOPYOBJ = sgemm_otcopy.o
 
-DGEMMKERNEL = dgemm_kernel_4x4.S
-DGEMMONCOPY = ../generic/gemm_ncopy_4.c
-DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+endif
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 
-CGEMMKERNEL = cgemm_kernel_4x4.S
-CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
+endif
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
 CGEMMONCOPYOBJ = cgemm_oncopy.o
 CGEMMOTCOPYOBJ = cgemm_otcopy.o
 
-ZGEMMKERNEL = zgemm_kernel_4x4.S
-ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPYOBJ = zgemm_oncopy.o
 ZGEMMOTCOPYOBJ = zgemm_otcopy.o
 
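Note on the parameterization above: the SGEMM_UNROLL_*/DGEMM_UNROLL_* values come from the build system's per-target configuration, not from this file. As a purely illustrative example with hypothetical values, if DGEMM_UNROLL_M were 8 and DGEMM_UNROLL_N were 4, $(DGEMMKERNEL) would expand to dgemm_kernel_8x4.S and the ifneq branch would also build the M-side packing objects dgemm_incopy.o and dgemm_itcopy.o, since the two unroll factors differ; when M and N match, the N-side gemm_ncopy/gemm_tcopy pair alone suffices.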
2 file diffs suppressed because they are too large
@@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow0 x12
 #define pCRow1 x13
 #define pCRow2 x14
-#define pA x15
-#define ppC x16
-#define ppCRow0 x17
-#define ppCRow1 x18
-#define ppCRow2 x19
-#define ppA x20
+#define pCRow3 x15
+#define pA x16
+#define ppC x17
+#define ppCRow0 x18
+#define ppCRow1 x19
+#define ppCRow2 x20
+#define ppCRow3 x21
+#define ppA x22
+#define alpha x23
 
 #define alpha0 d10
 #define alphaV0 v10.d[0]
-#define alpha1 d11
-#define alphaV1 v11.d[0]
-#define alpha2 d14
-#define alphaV2 v14.d[0]
-#define alpha3 d15
-#define alphaV3 v15.d[0]
+
+#define A_PRE_SIZE 1024
+#define B_PRE_SIZE 1024
+#define C_PRE_SIZE 128
 
 // 00 origM
 // 01 origN
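The new A_PRE_SIZE/B_PRE_SIZE/C_PRE_SIZE constants name the software-prefetch distances used throughout the kernel. As a rough C analogue (reviewer's sketch, not code from this PR), a prfm PLDL1KEEP at offset A_PRE_SIZE corresponds to:

#define A_PRE_SIZE 1024  /* prefetch distance in bytes, as in the diff */

/* Rough C analogue of: prfm PLDL1KEEP, [pA, #A_PRE_SIZE] */
static inline void prefetch_a(const double *pA)
{
    /* read prefetch (rw = 0), high temporal locality (3), i.e. "load, keep in cache" */
    __builtin_prefetch((const char *)pA + A_PRE_SIZE, 0, 3);
}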
@@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 12 pCRow0
 // 13 pCRow1
 // 14 pCRow2
-// 15 pA
-// 16 ppC
-// 17 ppCRow0
-// 18 must save ppCRow1
-// 19 must save ppCRow2
-// 20 must save ppA
-// 21 must save
-// 22 must save
-// 23 must save
+// 15 pCRow3
+// 16 pA
+// 17 ppC
+// 18 must save ppCRow0
+// 19 must save ppCRow1
+// 20 must save ppCRow2
+// 21 must save ppCRow3
+// 22 must save ppA
+// 23 must save alpha
 // 24 must save
 // 25 must save
 // 26 must save
@@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v08 must save pB00, pB01
 //v09 must save pB02, pB03
 //v10 must save ALPHA0
-//v11 must save ALPHA1
+//v11 must save
 //v12 must save pB10, pB11
 //v13 must save pB12, pB13
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v14 must save
+//v15 must save
 //v16 must save C00, C01
 //v17 must save C02, C03
 //v18 ppC00, ppC01
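Reading between the two register maps: with alpha parked in the integer register x23 and only materialized in d10 inside the SAVE macros, v11, v14 and v15 (and v10 during the compute loop) are freed to carry the extra B scalars that the new ldp d-register loads produce, which is why every fmla in the rewritten macros below can index lane 0 instead of juggling lane 1.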
@@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_I
-    ld1 {v8.2d, v9.2d}, [pB]
-    add pB, pB, #32
-    ld1 {v0.2d, v1.2d}, [pA]
+    ldp d8, d9, [pB]
+    add pB, pB, #16
+    ldp d10, d11, [pB]
+    add pB, pB, #16
+
+    ldp q0, q1, [pA]
     add pA, pA, #32
 
     fmul v16.2d, v0.2d, v8.2d[0]
-    fmul v29.2d, v1.2d, v9.2d[1]
+    fmul v29.2d, v1.2d, v11.2d[0]
 
-    ld1 {v2.2d, v3.2d}, [ppA]
+    ldp q2, q3, [ppA]
     add ppA, ppA, #32
 
-    fmul v20.2d, v0.2d, v8.2d[1]
-    fmul v25.2d, v1.2d, v9.2d[0]
+    fmul v20.2d, v0.2d, v9.2d[0]
+    fmul v25.2d, v1.2d, v10.2d[0]
+
+    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
 
     fmul v18.2d, v2.2d, v8.2d[0]
-    fmul v31.2d, v3.2d, v9.2d[1]
-    fmul v22.2d, v2.2d, v8.2d[1]
-    fmul v27.2d, v3.2d, v9.2d[0]
+    fmul v31.2d, v3.2d, v11.2d[0]
 
-    ld1 {v12.2d, v13.2d}, [pB] // for next round
-    add pB, pB, #32
+    prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
 
-    fmul v24.2d, v0.2d, v9.2d[0]
-    fmul v21.2d, v1.2d, v8.2d[1]
+    fmul v22.2d, v2.2d, v9.2d[0]
+    fmul v27.2d, v3.2d, v10.2d[0]
 
-    ld1 {v4.2d, v5.2d} , [pA] // for next round
+    ldp d12, d13, [pB]
+    add pB, pB, #16
+
+    fmul v24.2d, v0.2d, v10.2d[0]
+    fmul v21.2d, v1.2d, v9.2d[0]
+
+    ldp q4, q5, [pA] // for next round
     add pA, pA, #32
 
-    fmul v26.2d, v2.2d, v9.2d[0]
-    fmul v23.2d, v3.2d, v8.2d[1]
+    fmul v26.2d, v2.2d, v10.2d[0]
+    fmul v23.2d, v3.2d, v9.2d[0]
 
-    ld1 {v6.2d, v7.2d} , [ppA] // for next round
+    ldp q6, q7, [ppA] // for next round
     add ppA, ppA, #32
 
-    fmul v28.2d, v0.2d, v9.2d[1]
+    fmul v28.2d, v0.2d, v11.2d[0]
     fmul v17.2d, v1.2d, v8.2d[0]
-    fmul v30.2d, v2.2d, v9.2d[1]
+
+    ldp d14, d15, [pB]
+    add pB, pB, #16
+
+    fmul v30.2d, v2.2d, v11.2d[0]
     fmul v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro KERNEL8x4_M2
     fmla v16.2d, v4.2d, v12.2d[0]
-    fmla v29.2d, v5.2d, v13.2d[1]
+    fmla v29.2d, v5.2d, v15.2d[0]
 
-    ld1 {v8.2d, v9.2d}, [pB]
-    add pB, pB, #32
+    ldp d8, d9, [pB]
+    add pB, pB, #16
 
     fmla v18.2d, v6.2d, v12.2d[0]
-    fmla v31.2d, v7.2d, v13.2d[1]
-    fmla v20.2d, v4.2d, v12.2d[1]
-    fmla v25.2d, v5.2d, v13.2d[0]
+    fmla v31.2d, v7.2d, v15.2d[0]
 
-    prfm PLDL1KEEP, [pB, #512]
+    ldp d10, d11, [pB]
+    add pB, pB, #16
 
-    fmla v22.2d, v6.2d, v12.2d[1]
-    fmla v27.2d, v7.2d, v13.2d[0]
-    fmla v24.2d, v4.2d, v13.2d[0]
-    fmla v21.2d, v5.2d, v12.2d[1]
+    fmla v20.2d, v4.2d, v13.2d[0]
+    fmla v25.2d, v5.2d, v14.2d[0]
 
-    ld1 {v0.2d, v1.2d}, [pA]
+    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+    fmla v22.2d, v6.2d, v13.2d[0]
+    fmla v27.2d, v7.2d, v14.2d[0]
+    fmla v24.2d, v4.2d, v14.2d[0]
+    fmla v21.2d, v5.2d, v13.2d[0]
+
+    ldp q0, q1, [pA]
     add pA, pA, #32
 
-    fmla v26.2d, v6.2d, v13.2d[0]
-    fmla v23.2d, v7.2d, v12.2d[1]
-    fmla v28.2d, v4.2d, v13.2d[1]
+    fmla v26.2d, v6.2d, v14.2d[0]
+    fmla v23.2d, v7.2d, v13.2d[0]
+    fmla v28.2d, v4.2d, v15.2d[0]
     fmla v17.2d, v5.2d, v12.2d[0]
 
-    ld1 {v2.2d, v3.2d}, [ppA]
+    ldp q2, q3, [ppA]
     add ppA, ppA, #32
 
-    fmla v30.2d, v6.2d, v13.2d[1]
+    fmla v30.2d, v6.2d, v15.2d[0]
     fmla v19.2d, v7.2d, v12.2d[0]
 .endm
 
 .macro KERNEL8x4_M1
     fmla v16.2d, v0.2d, v8.2d[0]
-    fmla v29.2d, v1.2d, v9.2d[1]
+    fmla v29.2d, v1.2d, v11.2d[0]
 
-    ld1 {v12.2d, v13.2d}, [pB] // for next round
-    add pB, pB, #32
+    ldp d12, d13, [pB]
+    add pB, pB, #16
 
     fmla v18.2d, v2.2d, v8.2d[0]
-    fmla v31.2d, v3.2d, v9.2d[1]
-    fmla v20.2d, v0.2d, v8.2d[1]
-    fmla v25.2d, v1.2d, v9.2d[0]
+    fmla v31.2d, v3.2d, v11.2d[0]
 
-    prfm PLDL1KEEP, [pA, #512]
+    ldp d14, d15, [pB]
+    add pB, pB, #16
 
-    fmla v22.2d, v2.2d, v8.2d[1]
-    fmla v27.2d, v3.2d, v9.2d[0]
+    fmla v20.2d, v0.2d, v9.2d[0]
+    fmla v25.2d, v1.2d, v10.2d[0]
 
-    prfm PLDL1KEEP, [ppA, #512]
+    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
 
-    fmla v24.2d, v0.2d, v9.2d[0]
-    fmla v21.2d, v1.2d, v8.2d[1]
+    fmla v22.2d, v2.2d, v9.2d[0]
+    fmla v27.2d, v3.2d, v10.2d[0]
 
-    ld1 {v4.2d, v5.2d} , [pA] // for next round
+    prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
+
+    fmla v24.2d, v0.2d, v10.2d[0]
+    fmla v21.2d, v1.2d, v9.2d[0]
+
+    ldp q4, q5, [pA]
     add pA, pA, #32
 
-    fmla v26.2d, v2.2d, v9.2d[0]
-    fmla v23.2d, v3.2d, v8.2d[1]
-    fmla v28.2d, v0.2d, v9.2d[1]
+    fmla v26.2d, v2.2d, v10.2d[0]
+    fmla v23.2d, v3.2d, v9.2d[0]
+
+    fmla v28.2d, v0.2d, v11.2d[0]
     fmla v17.2d, v1.2d, v8.2d[0]
 
-    ld1 {v6.2d, v7.2d} , [ppA] // for next round
+    ldp q6, q7, [ppA]
     add ppA, ppA, #32
 
-    fmla v30.2d, v2.2d, v9.2d[1]
+    fmla v30.2d, v2.2d, v11.2d[0]
     fmla v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro KERNEL8x4_E
     fmla v16.2d, v4.2d, v12.2d[0]
-    fmla v25.2d, v5.2d, v13.2d[0]
+    fmla v25.2d, v5.2d, v14.2d[0]
     fmla v18.2d, v6.2d, v12.2d[0]
-    fmla v27.2d, v7.2d, v13.2d[0]
+    fmla v27.2d, v7.2d, v14.2d[0]
 
-    fmla v20.2d, v4.2d, v12.2d[1]
-    fmla v29.2d, v5.2d, v13.2d[1]
-    fmla v22.2d, v6.2d, v12.2d[1]
-    fmla v31.2d, v7.2d, v13.2d[1]
+    fmla v20.2d, v4.2d, v13.2d[0]
+    fmla v29.2d, v5.2d, v15.2d[0]
+    fmla v22.2d, v6.2d, v13.2d[0]
+    fmla v31.2d, v7.2d, v15.2d[0]
 
-    fmla v24.2d, v4.2d, v13.2d[0]
+    fmla v24.2d, v4.2d, v14.2d[0]
     fmla v17.2d, v5.2d, v12.2d[0]
-    fmla v26.2d, v6.2d, v13.2d[0]
+    fmla v26.2d, v6.2d, v14.2d[0]
     fmla v19.2d, v7.2d, v12.2d[0]
 
-    fmla v28.2d, v4.2d, v13.2d[1]
-    fmla v21.2d, v5.2d, v12.2d[1]
-    fmla v30.2d, v6.2d, v13.2d[1]
-    fmla v23.2d, v7.2d, v12.2d[1]
+    fmla v28.2d, v4.2d, v15.2d[0]
+    fmla v21.2d, v5.2d, v13.2d[0]
+    fmla v30.2d, v6.2d, v15.2d[0]
+    fmla v23.2d, v7.2d, v13.2d[0]
 .endm
 
 .macro KERNEL8x4_SUB
-    ld1 {v8.2d, v9.2d}, [pB]
-    add pB, pB, #32
-    ld1 {v0.2d, v1.2d}, [pA]
+    ldp d8, d9, [pB]
+    add pB, pB, #16
+    ldp d10, d11, [pB]
+    add pB, pB, #16
+    ldp q0, q1, [pA]
     add pA, pA, #32
 
     fmla v16.2d, v0.2d, v8.2d[0]
-    fmla v29.2d, v1.2d, v9.2d[1]
-    fmla v20.2d, v0.2d, v8.2d[1]
-    fmla v25.2d, v1.2d, v9.2d[0]
+    fmla v29.2d, v1.2d, v11.2d[0]
+    fmla v20.2d, v0.2d, v9.2d[0]
+    fmla v25.2d, v1.2d, v10.2d[0]
 
-    ld1 {v2.2d, v3.2d}, [ppA]
+    ldp q2, q3, [ppA]
     add ppA, ppA, #32
 
-    fmla v24.2d, v0.2d, v9.2d[0]
-    fmla v21.2d, v1.2d, v8.2d[1]
-    fmla v28.2d, v0.2d, v9.2d[1]
+    fmla v24.2d, v0.2d, v10.2d[0]
+    fmla v21.2d, v1.2d, v9.2d[0]
+    fmla v28.2d, v0.2d, v11.2d[0]
     fmla v17.2d, v1.2d, v8.2d[0]
 
     fmla v18.2d, v2.2d, v8.2d[0]
-    fmla v31.2d, v3.2d, v9.2d[1]
-    fmla v22.2d, v2.2d, v8.2d[1]
-    fmla v27.2d, v3.2d, v9.2d[0]
+    fmla v31.2d, v3.2d, v11.2d[0]
+    fmla v22.2d, v2.2d, v9.2d[0]
+    fmla v27.2d, v3.2d, v10.2d[0]
 
-    fmla v26.2d, v2.2d, v9.2d[0]
-    fmla v23.2d, v3.2d, v8.2d[1]
-    fmla v30.2d, v2.2d, v9.2d[1]
+    fmla v26.2d, v2.2d, v10.2d[0]
+    fmla v23.2d, v3.2d, v9.2d[0]
+    fmla v30.2d, v2.2d, v11.2d[0]
     fmla v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro SAVE8x4
+    fmov alpha0, alpha
+
+    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
     add ppCRow0, pCRow0, #32
 
-    ld1 {v0.2d, v1.2d}, [pCRow0]
+    ldp q0, q1, [pCRow0]
     fmla v0.2d, v16.2d, alphaV0
-    fmla v1.2d, v17.2d, alphaV1
-    st1 {v0.2d, v1.2d}, [pCRow0]
-
-    ld1 {v2.2d, v3.2d}, [ppCRow0]
-    fmla v2.2d, v18.2d, alphaV2
-    fmla v3.2d, v19.2d, alphaV3
-    st1 {v2.2d, v3.2d}, [ppCRow0]
-
-    add pCRow1, pCRow0, LDC
-    add ppCRow1, ppCRow0, LDC
-
-    ld1 {v4.2d, v5.2d}, [pCRow1]
-    fmla v4.2d, v20.2d, alphaV0
-    fmla v5.2d, v21.2d, alphaV1
-    st1 {v4.2d, v5.2d}, [pCRow1]
-
-    ld1 {v6.2d, v7.2d}, [ppCRow1]
-    fmla v6.2d, v22.2d, alphaV2
-    fmla v7.2d, v23.2d, alphaV3
-    st1 {v6.2d, v7.2d}, [ppCRow1]
-
-    add pCRow2, pCRow1, LDC
-    add ppCRow2, ppCRow1, LDC
-
-    ld1 {v0.2d, v1.2d}, [pCRow2]
-    fmla v0.2d, v24.2d, alphaV0
-    fmla v1.2d, v25.2d, alphaV1
-    st1 {v0.2d, v1.2d}, [pCRow2]
-
-    ld1 {v2.2d, v3.2d}, [ppCRow2]
-    fmla v2.2d, v26.2d, alphaV2
-    fmla v3.2d, v27.2d, alphaV3
-    st1 {v2.2d, v3.2d}, [ppCRow2]
-
-    add pCRow1, pCRow2, LDC
-    add ppCRow1, ppCRow2, LDC
-
-    ld1 {v4.2d, v5.2d}, [pCRow1]
-    fmla v4.2d, v28.2d, alphaV0
-    fmla v5.2d, v29.2d, alphaV1
-    st1 {v4.2d, v5.2d}, [pCRow1]
-
-    ld1 {v6.2d, v7.2d}, [ppCRow1]
-    fmla v6.2d, v30.2d, alphaV2
-    fmla v7.2d, v31.2d, alphaV3
-    st1 {v6.2d, v7.2d}, [ppCRow1]
+    fmla v1.2d, v17.2d, alphaV0
+    stp q0, q1, [pCRow0]
+
+    add pCRow0, pCRow0, #64
+
+    ldp q2, q3, [ppCRow0]
+    fmla v2.2d, v18.2d, alphaV0
+    fmla v3.2d, v19.2d, alphaV0
+    stp q2, q3, [ppCRow0]
+
+    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+    add ppCRow1, pCRow1, #32
+
+    ldp q4, q5, [pCRow1]
+    fmla v4.2d, v20.2d, alphaV0
+    fmla v5.2d, v21.2d, alphaV0
+    stp q4, q5, [pCRow1]
+
+    add pCRow1, pCRow1, #64
+
+    ldp q6, q7, [ppCRow1]
+    fmla v6.2d, v22.2d, alphaV0
+    fmla v7.2d, v23.2d, alphaV0
+    stp q6, q7, [ppCRow1]
+
+    prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+    add ppCRow2, pCRow2, #32
+
+    ldp q0, q1, [pCRow2]
+    fmla v0.2d, v24.2d, alphaV0
+    fmla v1.2d, v25.2d, alphaV0
+    stp q0, q1, [pCRow2]
+
+    add pCRow2, pCRow2, #64
+
+    ldp q2, q3, [ppCRow2]
+    fmla v2.2d, v26.2d, alphaV0
+    fmla v3.2d, v27.2d, alphaV0
+    stp q2, q3, [ppCRow2]
+
+    prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+    add ppCRow3, pCRow3, #32
+
+    ldp q4, q5, [pCRow3]
+    fmla v4.2d, v28.2d, alphaV0
+    fmla v5.2d, v29.2d, alphaV0
+    stp q4, q5, [pCRow3]
+
+    add pCRow3, pCRow3, #64
+
+    ldp q6, q7, [ppCRow3]
+    fmla v6.2d, v30.2d, alphaV0
+    fmla v7.2d, v31.2d, alphaV0
+    stp q6, q7, [ppCRow3]
 .endm
 
 /******************************************************************************/
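For orientation, here is a rough, unofficial C model of what one pass of the KERNEL8x4 macros accumulates and what SAVE8x4 writes back (reviewer's sketch; names are illustrative, not from this PR). The kernel keeps an 8x4 tile of C in v16..v31 (two doubles per register), pA/ppA walk the two 4-row halves of the packed A block, and alpha is now applied from the single alphaV0 during the save:

#include <stdio.h>

/* Unofficial reference model of the 8x4 micro-kernel (illustrative):
 * A is packed in 4-row panels (pA covers rows 0-3, ppA rows 4-7),
 * B is packed 4 values per k-step; acc mirrors v16..v31. */
static void ref_kernel8x4(long K, const double *pA, const double *ppA,
                          const double *B, double *C, long ldc, double alpha)
{
    double acc[8][4] = {{0.0}};
    for (long k = 0; k < K; k++) {
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++) {
                acc[i][j]     += pA[4*k + i]  * B[4*k + j];  /* rows 0-3 */
                acc[4 + i][j] += ppA[4*k + i] * B[4*k + j];  /* rows 4-7 */
            }
    }
    /* SAVE8x4: C(i,j) += alpha * acc(i,j), column-major C */
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 8; i++)
            C[j*ldc + i] += alpha * acc[i][j];
}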
@@ -403,30 +436,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x4
+    fmov alpha0, alpha
+
     ld1 {v8.2d, v9.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
-    fmla v9.2d, v17.2d, alphaV1
+    fmla v9.2d, v17.2d, alphaV0
     st1 {v8.2d, v9.2d}, [pCRow0]
 
     add pCRow1, pCRow0, LDC
 
     ld1 {v12.2d, v13.2d}, [pCRow1]
-    fmla v12.2d, v20.2d, alphaV2
-    fmla v13.2d, v21.2d, alphaV3
+    fmla v12.2d, v20.2d, alphaV0
+    fmla v13.2d, v21.2d, alphaV0
     st1 {v12.2d, v13.2d}, [pCRow1]
 
     add pCRow2, pCRow1, LDC
 
     ld1 {v8.2d, v9.2d}, [pCRow2]
     fmla v8.2d, v24.2d, alphaV0
-    fmla v9.2d, v25.2d, alphaV1
+    fmla v9.2d, v25.2d, alphaV0
     st1 {v8.2d, v9.2d}, [pCRow2]
 
     add pCRow1, pCRow2, LDC
 
     ld1 {v12.2d, v13.2d}, [pCRow1]
-    fmla v12.2d, v28.2d, alphaV2
-    fmla v13.2d, v29.2d, alphaV3
+    fmla v12.2d, v28.2d, alphaV0
+    fmla v13.2d, v29.2d, alphaV0
     st1 {v12.2d, v13.2d}, [pCRow1]
 
     add pCRow0, pCRow0, #32
@@ -454,6 +489,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x4
+    fmov alpha0, alpha
+
     ld1 {v8.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
     st1 {v8.2d}, [pCRow0]
@@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add pCRow1, pCRow0, LDC
 
     ld1 {v12.2d}, [pCRow1]
-    fmla v12.2d, v20.2d, alphaV1
+    fmla v12.2d, v20.2d, alphaV0
     st1 {v12.2d}, [pCRow1]
 
     add pCRow2, pCRow1, LDC
 
     ld1 {v8.2d}, [pCRow2]
-    fmla v8.2d, v24.2d, alphaV2
+    fmla v8.2d, v24.2d, alphaV0
     st1 {v8.2d}, [pCRow2]
 
     add pCRow1, pCRow2, LDC
 
     ld1 {v12.2d}, [pCRow1]
-    fmla v12.2d, v28.2d, alphaV3
+    fmla v12.2d, v28.2d, alphaV0
     st1 {v12.2d}, [pCRow1]
 
     add pCRow0, pCRow0, #16
@@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x4
+    fmov alpha0, alpha
+
     add pCRow1, pCRow0, LDC
 
     ld1 {v8.d}[0], [pCRow0]
@@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     ld1 {v12.d}[0], [pCRow2]
     ld1 {v12.d}[1], [pCRow1]
-    fmla v12.2d, v20.2d, alphaV1
+    fmla v12.2d, v20.2d, alphaV0
     st1 {v12.d}[0], [pCRow2]
     st1 {v12.d}[1], [pCRow1]
 
@@ -540,16 +579,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x2
+    fmov alpha0, alpha
+
     ld1 {v8.2d, v9.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
-    fmla v9.2d, v17.2d, alphaV1
+    fmla v9.2d, v17.2d, alphaV0
     st1 {v8.2d, v9.2d}, [pCRow0]
 
     add pCRow1, pCRow0, LDC
 
     ld1 {v12.2d, v13.2d}, [pCRow1]
-    fmla v12.2d, v20.2d, alphaV2
-    fmla v13.2d, v21.2d, alphaV3
+    fmla v12.2d, v20.2d, alphaV0
+    fmla v13.2d, v21.2d, alphaV0
     st1 {v12.2d, v13.2d}, [pCRow1]
 
     add pCRow0, pCRow0, #32
@@ -574,6 +615,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x2
+    fmov alpha0, alpha
+
     ld1 {v8.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
     st1 {v8.2d}, [pCRow0]
@@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add pCRow1 , pCRow0, LDC
 
     ld1 {v12.2d}, [pCRow1]
-    fmla v12.2d, v20.2d, alphaV1
+    fmla v12.2d, v20.2d, alphaV0
     st1 {v12.2d}, [pCRow1]
 
     add pCRow0, pCRow0, #16
@@ -604,6 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x2
+    fmov alpha0, alpha
+
     add pCRow1 , pCRow0, LDC
 
     ld1 {v8.d}[0], [pCRow0]
@@ -634,9 +679,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x1
+    fmov alpha0, alpha
+
     ld1 {v8.2d, v9.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
-    fmla v9.2d, v17.2d, alphaV1
+    fmla v9.2d, v17.2d, alphaV0
     st1 {v8.2d, v9.2d}, [pCRow0]
 
     add pCRow0, pCRow0, #32
@@ -662,6 +709,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x1
+    fmov alpha0, alpha
+
     ld1 {v8.2d}, [pCRow0]
     fmla v8.2d, v16.2d, alphaV0
     st1 {v8.2d}, [pCRow0]
@@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x1
+    fmov alpha0, alpha
+
     ldr d8, [pCRow0]
     fmadd d8, d16, alpha0, d8
     str d8, [pCRow0]
@@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     stp x26, x27, [sp, #(9 * 16)]
     str x28, [sp, #(10 * 16)]
 
-    fmov alpha0, d0
-    fmov alpha1, d0
-    fmov alpha2, d0
-    fmov alpha3, d0
+    fmov alpha, d0
+    prfm PLDL1KEEP, [origPA]
+    prfm PLDL1KEEP, [origPB]
 
     lsl LDC, LDC, #3 // ldc = ldc * 8
 
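In the prologue, alpha now moves once from d0 into the general-purpose register x23 (fmov between FP and integer registers), replacing four duplicated FP copies; each SAVE macro restores it into d10 on entry. The freed prologue slots are spent prefetching the packed A and B buffers before the first tile loop.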
@@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ble dgemm_kernel_L2_BEGIN
 
 dgemm_kernel_L4_BEGIN:
-    mov pCRow0, pC // pCRow0 = C
-    add pC, pC, LDC, lsl #2
+    mov pCRow0, pC
+    add pCRow1, pCRow0, LDC
+    add pCRow2, pCRow1, LDC
+    add pCRow3, pCRow2, LDC
+    add pC, pCRow3, LDC
 
     lsl temp, origK, #5 // k * 4 * 8
     mov pA, origPA // pA = start of A array
     add ppA, temp, pA
+    prfm PLDL1KEEP, [ppA]
 
 //------------------------------------------------------------------------------
 
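The ppA setup is the key to the 8-row blocking: packed A stores 4-row panels contiguously, and each panel occupies K * 4 doubles (K << 5 bytes), so ppA lands on the second panel of the current 8-row block. A small C analogue (reviewer's sketch, illustrative names):

#include <stdint.h>

/* Illustrative C analogue of:
 *   lsl temp, origK, #5   // temp = K * 4 * 8 bytes = one 4-row panel
 *   add ppA, temp, pA     // ppA = start of the second 4-row panel
 */
static const double *second_panel(const double *pA, int64_t K)
{
    int64_t panel_bytes = K << 5;   /* K * 4 rows * 8 bytes per double */
    return (const double *)((const char *)pA + panel_bytes);
}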
@@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN:
     cmp counterI, #0
     ble dgemm_kernel_L4_M4_BEGIN
 
+    .align 5
 dgemm_kernel_L4_M8_20:
 
     mov pB, origPB
-    asr counterL , origK, #1 // L = K / 2
-    cmp counterL , #2 // is there at least 4 to do?
+    asr counterL , origK, #2 // L = K / 4
+    cmp counterL , #2
     blt dgemm_kernel_L4_M8_32
 
-    KERNEL8x4_I // do one in the K
-    KERNEL8x4_M2 // do another in the K
+    KERNEL8x4_I
+    KERNEL8x4_M2
+    KERNEL8x4_M1
+    KERNEL8x4_M2
 
     subs counterL, counterL, #2 // subtract 2
     ble dgemm_kernel_L4_M8_22a
 
+    .align 5
+
 dgemm_kernel_L4_M8_22:
 
     KERNEL8x4_M1
     KERNEL8x4_M2
+    KERNEL8x4_M1
+    KERNEL8x4_M2
 
     subs counterL, counterL, #1
     bgt dgemm_kernel_L4_M8_22
 
+    .align 5
 dgemm_kernel_L4_M8_22a:
 
     KERNEL8x4_M1
+    KERNEL8x4_M2
+    KERNEL8x4_M1
     KERNEL8x4_E
 
     b dgemm_kernel_L4_M8_44
 
+    .align 5
 dgemm_kernel_L4_M8_32:
 
     tst counterL, #1
     ble dgemm_kernel_L4_M8_40
 
     KERNEL8x4_I
 
+    KERNEL8x4_M2
+    KERNEL8x4_M1
     KERNEL8x4_E
 
     b dgemm_kernel_L4_M8_44
 
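The depth-loop bookkeeping changes from 2 to 4 k-steps per pipelined iteration: counterL becomes K / 4 and the tail becomes K & 3, drained one step at a time by KERNEL8x4_SUB in the next hunk. A quick C check of that arithmetic (reviewer's sketch):

#include <assert.h>

/* Sanity-check the unroll bookkeeping used above:
 *   asr counterL, origK, #2   -> K / 4 pipelined groups
 *   ands counterL, origK, #3  -> K % 4 leftover KERNEL8x4_SUB steps */
int main(void)
{
    for (int K = 1; K < 1000; K++) {
        int groups = K >> 2;   /* 4 k-steps per pipelined group */
        int tail   = K & 3;    /* single k-steps via KERNEL8x4_SUB */
        assert(4 * groups + tail == K);
    }
    return 0;
}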
@@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40:
 
 dgemm_kernel_L4_M8_44:
 
-    ands counterL , origK, #1
+    ands counterL , origK, #3
     ble dgemm_kernel_L4_M8_100
 
+    .align 5
 dgemm_kernel_L4_M8_46:
 
     KERNEL8x4_SUB
 
+    subs counterL, counterL, #1
+    bne dgemm_kernel_L4_M8_46
+
 dgemm_kernel_L4_M8_100:
+    lsl temp, origK, #5
+    prfm PLDL1KEEP, [pA, temp]
+    prfm PLDL1KEEP, [ppA, temp]
+    prfm PLDL1KEEP, [origPB]
 
     SAVE8x4
 
@@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END:
     subs counterI, counterI, #1
     bne dgemm_kernel_L4_M8_20
 
-
 
 dgemm_kernel_L4_M4_BEGIN:
     mov counterI, origM
     tst counterI , #7
8 file diffs suppressed because they are too large