Merge pull request #802 from ashwinyes/develop_20160314_dgemm_optimization

DGEMM Optimizations for Cortex-A57
Zhang Xianyi 2016-03-14 20:31:03 -04:00
commit e17303933a
13 changed files with 21421 additions and 202 deletions


@@ -141,5 +141,11 @@ In chronological order:
 * Martin Koehler <https://github.com/grisuthedragon/>
   * [2015-09-07] Improved imatcopy
 
+* Ashwin Sekhar T K <https://github.com/ashwinyes/>
+  * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
+  * [2015-11-20] lapack-test fixes for Cortex-A57
+  * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
+  * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
+
 * [Your name or handle] <[email or website]>
   * [Date] [Brief summary of your changes]


@@ -60,32 +60,55 @@ DGEMVTKERNEL = gemv_t.S
 CGEMVTKERNEL  = zgemv_t.S
 ZGEMVTKERNEL  = zgemv_t.S
 
-STRMMKERNEL    = strmm_kernel_4x4.S
-DTRMMKERNEL    = dtrmm_kernel_4x4.S
-CTRMMKERNEL    = ctrmm_kernel_4x4.S
-ZTRMMKERNEL    = ztrmm_kernel_4x4.S
-
-SGEMMKERNEL    = sgemm_kernel_4x4.S
-SGEMMONCOPY    = ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
+SGEMMKERNEL    = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL    = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY    = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY    = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
+endif
+SGEMMONCOPY    = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY    = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
 SGEMMONCOPYOBJ = sgemm_oncopy.o
 SGEMMOTCOPYOBJ = sgemm_otcopy.o
 
-DGEMMKERNEL    = dgemm_kernel_4x4.S
-DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
-DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
+DGEMMKERNEL    = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL    = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY    = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY    = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+endif
+DGEMMONCOPY    = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY    = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 
-CGEMMKERNEL    = cgemm_kernel_4x4.S
-CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
+CGEMMKERNEL    = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL    = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY    = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY    = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
+endif
+CGEMMONCOPY    = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
 CGEMMONCOPYOBJ = cgemm_oncopy.o
 CGEMMOTCOPYOBJ = cgemm_otcopy.o
 
-ZGEMMKERNEL    = zgemm_kernel_4x4.S
-ZGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
-ZGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
+ZGEMMKERNEL    = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL    = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY    = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY    = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+endif
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPYOBJ = zgemm_oncopy.o
 ZGEMMOTCOPYOBJ = zgemm_otcopy.o
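
Editorial note: with this hunk the kernel and packing sources are selected from the per-target *GEMM_UNROLL_M/*GEMM_UNROLL_N values (defined in param.h), so for example DGEMM_UNROLL_M=8 and DGEMM_UNROLL_N=4 picks dgemm_kernel_8x4.S plus, because M != N, separate incopy/itcopy packing routines for the A panels. As a rough orientation for what the ../generic/gemm_ncopy_<n>.c files referenced above do, here is a simplified C sketch (not the actual OpenBLAS source, whose signature and unrolling differ):

    #include <stddef.h>

    /* Simplified sketch of a generic gemm_ncopy_4-style packing routine:
     * repack an lda-strided matrix into contiguous 4-wide panels so the
     * assembly micro-kernel can stream its operands with unit stride. */
    void gemm_ncopy_4_sketch(size_t m, size_t n, const double *a,
                             size_t lda, double *b)
    {
        size_t j = 0;
        for (; j + 4 <= n; j += 4)          /* full panels of 4 columns */
            for (size_t i = 0; i < m; i++) {
                *b++ = a[i + (j + 0) * lda];
                *b++ = a[i + (j + 1) * lda];
                *b++ = a[i + (j + 2) * lda];
                *b++ = a[i + (j + 3) * lda];
            }
        for (; j < n; j++)                  /* remainder columns */
            for (size_t i = 0; i < m; i++)
                *b++ = a[i + j * lda];
    }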

kernel/arm64/cgemm_kernel_8x4.S: new executable file, 2044 lines (diff suppressed because it is too large)

kernel/arm64/ctrmm_kernel_8x4.S: new executable file, 2425 lines (diff suppressed because it is too large)


@@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow0 x12
 #define pCRow1 x13
 #define pCRow2 x14
-#define pA x15
-#define ppC x16
-#define ppCRow0 x17
-#define ppCRow1 x18
-#define ppCRow2 x19
-#define ppA x20
+#define pCRow3 x15
+#define pA x16
+#define ppC x17
+#define ppCRow0 x18
+#define ppCRow1 x19
+#define ppCRow2 x20
+#define ppCRow3 x21
+#define ppA x22
+#define alpha x23
 
 #define alpha0 d10
 #define alphaV0 v10.d[0]
-#define alpha1 d11
-#define alphaV1 v11.d[0]
-#define alpha2 d14
-#define alphaV2 v14.d[0]
-#define alpha3 d15
-#define alphaV3 v15.d[0]
+
+#define A_PRE_SIZE 1024
+#define B_PRE_SIZE 1024
+#define C_PRE_SIZE 128
 
 // 00 origM
 // 01 origN
@@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 12 pCRow0
 // 13 pCRow1
 // 14 pCRow2
-// 15 pA
-// 16 ppC
-// 17 ppCRow0
-// 18 must save ppCRow1
-// 19 must save ppCRow2
-// 20 must save ppA
-// 21 must save
-// 22 must save
-// 23 must save
+// 15 pCRow3
+// 16 pA
+// 17 ppC
+// 18 must save ppCRow0
+// 19 must save ppCRow1
+// 20 must save ppCRow2
+// 21 must save ppCRow3
+// 22 must save ppA
+// 23 must save alpha
 // 24 must save
 // 25 must save
 // 26 must save
@@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v08 must save pB00, pB01
 //v09 must save pB02, pB03
 //v10 must save ALPHA0
-//v11 must save ALPHA1
+//v11 must save
 //v12 must save pB10, pB11
 //v13 must save pB12, pB13
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v14 must save
+//v15 must save
 //v16 must save C00, C01
 //v17 must save C02, C03
 //v18 ppC00, ppC01
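
Editorial note: the rewrite parks alpha in a general-purpose register (x23) and keeps only one NEON broadcast copy (d10/v10), freeing v11/v14/v15 to hold B values, and it replaces the hard-coded #512 prefetch offsets with the named distances above. A loose C analogue of the added prfm instructions, using the standard GCC/Clang builtin (illustrative only; the constants are the ones defined in the diff):

    #define A_PRE_SIZE 1024   /* ~16 cache lines ahead on 64-byte lines */
    #define B_PRE_SIZE 1024
    #define C_PRE_SIZE 128    /* the two lines of C each row touches */

    /* Rough equivalent of the prfm PLDL1KEEP / PLDL2KEEP instructions. */
    static inline void prefetch_ahead(const double *pA, const double *pB,
                                      const double *pC)
    {
        __builtin_prefetch((const char *)pA + A_PRE_SIZE, 0, 3); /* L1, keep */
        __builtin_prefetch((const char *)pB + B_PRE_SIZE, 0, 3);
        __builtin_prefetch((const char *)pC + C_PRE_SIZE, 0, 2); /* outer level */
    }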
@@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_I
-    ld1     {v8.2d, v9.2d}, [pB]
-    add     pB, pB, #32
-    ld1     {v0.2d, v1.2d}, [pA]
+    ldp     d8, d9, [pB]
+    add     pB, pB, #16
+    ldp     d10, d11, [pB]
+    add     pB, pB, #16
+    ldp     q0, q1, [pA]
     add     pA, pA, #32
 
     fmul    v16.2d, v0.2d, v8.2d[0]
-    fmul    v29.2d, v1.2d, v9.2d[1]
+    fmul    v29.2d, v1.2d, v11.2d[0]
 
-    ld1     {v2.2d, v3.2d}, [ppA]
+    ldp     q2, q3, [ppA]
     add     ppA, ppA, #32
 
-    fmul    v20.2d, v0.2d, v8.2d[1]
-    fmul    v25.2d, v1.2d, v9.2d[0]
+    fmul    v20.2d, v0.2d, v9.2d[0]
+    fmul    v25.2d, v1.2d, v10.2d[0]
+
+    prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
     fmul    v18.2d, v2.2d, v8.2d[0]
-    fmul    v31.2d, v3.2d, v9.2d[1]
-    fmul    v22.2d, v2.2d, v8.2d[1]
-    fmul    v27.2d, v3.2d, v9.2d[0]
+    fmul    v31.2d, v3.2d, v11.2d[0]
 
-    ld1     {v12.2d, v13.2d}, [pB]  // for next round
-    add     pB, pB, #32
+    prfm    PLDL1KEEP, [ppA, #A_PRE_SIZE]
 
-    fmul    v24.2d, v0.2d, v9.2d[0]
-    fmul    v21.2d, v1.2d, v8.2d[1]
+    fmul    v22.2d, v2.2d, v9.2d[0]
+    fmul    v27.2d, v3.2d, v10.2d[0]
 
-    ld1     {v4.2d, v5.2d} , [pA]   // for next round
+    ldp     d12, d13, [pB]
+    add     pB, pB, #16
+
+    fmul    v24.2d, v0.2d, v10.2d[0]
+    fmul    v21.2d, v1.2d, v9.2d[0]
+
+    ldp     q4, q5, [pA]    // for next round
     add     pA, pA, #32
 
-    fmul    v26.2d, v2.2d, v9.2d[0]
-    fmul    v23.2d, v3.2d, v8.2d[1]
+    fmul    v26.2d, v2.2d, v10.2d[0]
+    fmul    v23.2d, v3.2d, v9.2d[0]
 
-    ld1     {v6.2d, v7.2d} , [ppA]  // for next round
+    ldp     q6, q7, [ppA]   // for next round
     add     ppA, ppA, #32
 
-    fmul    v28.2d, v0.2d, v9.2d[1]
+    fmul    v28.2d, v0.2d, v11.2d[0]
     fmul    v17.2d, v1.2d, v8.2d[0]
-    fmul    v30.2d, v2.2d, v9.2d[1]
+
+    ldp     d14, d15, [pB]
+    add     pB, pB, #16
+
+    fmul    v30.2d, v2.2d, v11.2d[0]
     fmul    v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro KERNEL8x4_M2
     fmla    v16.2d, v4.2d, v12.2d[0]
-    fmla    v29.2d, v5.2d, v13.2d[1]
+    fmla    v29.2d, v5.2d, v15.2d[0]
 
-    ld1     {v8.2d, v9.2d}, [pB]
-    add     pB, pB, #32
+    ldp     d8, d9, [pB]
+    add     pB, pB, #16
 
     fmla    v18.2d, v6.2d, v12.2d[0]
-    fmla    v31.2d, v7.2d, v13.2d[1]
-    fmla    v20.2d, v4.2d, v12.2d[1]
-    fmla    v25.2d, v5.2d, v13.2d[0]
+    fmla    v31.2d, v7.2d, v15.2d[0]
 
-    prfm    PLDL1KEEP, [pB, #512]
+    ldp     d10, d11, [pB]
+    add     pB, pB, #16
 
-    fmla    v22.2d, v6.2d, v12.2d[1]
-    fmla    v27.2d, v7.2d, v13.2d[0]
-    fmla    v24.2d, v4.2d, v13.2d[0]
-    fmla    v21.2d, v5.2d, v12.2d[1]
+    fmla    v20.2d, v4.2d, v13.2d[0]
+    fmla    v25.2d, v5.2d, v14.2d[0]
 
-    ld1     {v0.2d, v1.2d}, [pA]
+    prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+    fmla    v22.2d, v6.2d, v13.2d[0]
+    fmla    v27.2d, v7.2d, v14.2d[0]
+    fmla    v24.2d, v4.2d, v14.2d[0]
+    fmla    v21.2d, v5.2d, v13.2d[0]
+
+    ldp     q0, q1, [pA]
     add     pA, pA, #32
 
-    fmla    v26.2d, v6.2d, v13.2d[0]
-    fmla    v23.2d, v7.2d, v12.2d[1]
-    fmla    v28.2d, v4.2d, v13.2d[1]
+    fmla    v26.2d, v6.2d, v14.2d[0]
+    fmla    v23.2d, v7.2d, v13.2d[0]
+    fmla    v28.2d, v4.2d, v15.2d[0]
     fmla    v17.2d, v5.2d, v12.2d[0]
 
-    ld1     {v2.2d, v3.2d}, [ppA]
+    ldp     q2, q3, [ppA]
     add     ppA, ppA, #32
 
-    fmla    v30.2d, v6.2d, v13.2d[1]
+    fmla    v30.2d, v6.2d, v15.2d[0]
     fmla    v19.2d, v7.2d, v12.2d[0]
 .endm
 
 .macro KERNEL8x4_M1
     fmla    v16.2d, v0.2d, v8.2d[0]
-    fmla    v29.2d, v1.2d, v9.2d[1]
+    fmla    v29.2d, v1.2d, v11.2d[0]
 
-    ld1     {v12.2d, v13.2d}, [pB]  // for next round
-    add     pB, pB, #32
+    ldp     d12, d13, [pB]
+    add     pB, pB, #16
 
     fmla    v18.2d, v2.2d, v8.2d[0]
-    fmla    v31.2d, v3.2d, v9.2d[1]
-    fmla    v20.2d, v0.2d, v8.2d[1]
-    fmla    v25.2d, v1.2d, v9.2d[0]
+    fmla    v31.2d, v3.2d, v11.2d[0]
 
-    prfm    PLDL1KEEP, [pA, #512]
+    ldp     d14, d15, [pB]
+    add     pB, pB, #16
 
-    fmla    v22.2d, v2.2d, v8.2d[1]
-    fmla    v27.2d, v3.2d, v9.2d[0]
+    fmla    v20.2d, v0.2d, v9.2d[0]
+    fmla    v25.2d, v1.2d, v10.2d[0]
 
-    prfm    PLDL1KEEP, [ppA, #512]
+    prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
-    fmla    v24.2d, v0.2d, v9.2d[0]
-    fmla    v21.2d, v1.2d, v8.2d[1]
+    fmla    v22.2d, v2.2d, v9.2d[0]
+    fmla    v27.2d, v3.2d, v10.2d[0]
 
-    ld1     {v4.2d, v5.2d} , [pA]   // for next round
+    prfm    PLDL1KEEP, [ppA, #A_PRE_SIZE]
+
+    fmla    v24.2d, v0.2d, v10.2d[0]
+    fmla    v21.2d, v1.2d, v9.2d[0]
+
+    ldp     q4, q5, [pA]
     add     pA, pA, #32
 
-    fmla    v26.2d, v2.2d, v9.2d[0]
-    fmla    v23.2d, v3.2d, v8.2d[1]
-    fmla    v28.2d, v0.2d, v9.2d[1]
+    fmla    v26.2d, v2.2d, v10.2d[0]
+    fmla    v23.2d, v3.2d, v9.2d[0]
+    fmla    v28.2d, v0.2d, v11.2d[0]
     fmla    v17.2d, v1.2d, v8.2d[0]
 
-    ld1     {v6.2d, v7.2d} , [ppA]  // for next round
+    ldp     q6, q7, [ppA]
     add     ppA, ppA, #32
 
-    fmla    v30.2d, v2.2d, v9.2d[1]
+    fmla    v30.2d, v2.2d, v11.2d[0]
     fmla    v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro KERNEL8x4_E
     fmla    v16.2d, v4.2d, v12.2d[0]
-    fmla    v25.2d, v5.2d, v13.2d[0]
+    fmla    v25.2d, v5.2d, v14.2d[0]
     fmla    v18.2d, v6.2d, v12.2d[0]
-    fmla    v27.2d, v7.2d, v13.2d[0]
-    fmla    v20.2d, v4.2d, v12.2d[1]
-    fmla    v29.2d, v5.2d, v13.2d[1]
-    fmla    v22.2d, v6.2d, v12.2d[1]
-    fmla    v31.2d, v7.2d, v13.2d[1]
-    fmla    v24.2d, v4.2d, v13.2d[0]
+    fmla    v27.2d, v7.2d, v14.2d[0]
+    fmla    v20.2d, v4.2d, v13.2d[0]
+    fmla    v29.2d, v5.2d, v15.2d[0]
+    fmla    v22.2d, v6.2d, v13.2d[0]
+    fmla    v31.2d, v7.2d, v15.2d[0]
+    fmla    v24.2d, v4.2d, v14.2d[0]
     fmla    v17.2d, v5.2d, v12.2d[0]
-    fmla    v26.2d, v6.2d, v13.2d[0]
+    fmla    v26.2d, v6.2d, v14.2d[0]
     fmla    v19.2d, v7.2d, v12.2d[0]
-    fmla    v28.2d, v4.2d, v13.2d[1]
-    fmla    v21.2d, v5.2d, v12.2d[1]
-    fmla    v30.2d, v6.2d, v13.2d[1]
-    fmla    v23.2d, v7.2d, v12.2d[1]
+    fmla    v28.2d, v4.2d, v15.2d[0]
+    fmla    v21.2d, v5.2d, v13.2d[0]
+    fmla    v30.2d, v6.2d, v15.2d[0]
+    fmla    v23.2d, v7.2d, v13.2d[0]
 .endm
 
 .macro KERNEL8x4_SUB
-    ld1     {v8.2d, v9.2d}, [pB]
-    add     pB, pB, #32
-    ld1     {v0.2d, v1.2d}, [pA]
+    ldp     d8, d9, [pB]
+    add     pB, pB, #16
+    ldp     d10, d11, [pB]
+    add     pB, pB, #16
+    ldp     q0, q1, [pA]
     add     pA, pA, #32
 
     fmla    v16.2d, v0.2d, v8.2d[0]
-    fmla    v29.2d, v1.2d, v9.2d[1]
-    fmla    v20.2d, v0.2d, v8.2d[1]
-    fmla    v25.2d, v1.2d, v9.2d[0]
+    fmla    v29.2d, v1.2d, v11.2d[0]
+    fmla    v20.2d, v0.2d, v9.2d[0]
+    fmla    v25.2d, v1.2d, v10.2d[0]
 
-    ld1     {v2.2d, v3.2d}, [ppA]
+    ldp     q2, q3, [ppA]
     add     ppA, ppA, #32
 
-    fmla    v24.2d, v0.2d, v9.2d[0]
-    fmla    v21.2d, v1.2d, v8.2d[1]
-    fmla    v28.2d, v0.2d, v9.2d[1]
+    fmla    v24.2d, v0.2d, v10.2d[0]
+    fmla    v21.2d, v1.2d, v9.2d[0]
+    fmla    v28.2d, v0.2d, v11.2d[0]
     fmla    v17.2d, v1.2d, v8.2d[0]
 
     fmla    v18.2d, v2.2d, v8.2d[0]
-    fmla    v31.2d, v3.2d, v9.2d[1]
-    fmla    v22.2d, v2.2d, v8.2d[1]
-    fmla    v27.2d, v3.2d, v9.2d[0]
-    fmla    v26.2d, v2.2d, v9.2d[0]
-    fmla    v23.2d, v3.2d, v8.2d[1]
-    fmla    v30.2d, v2.2d, v9.2d[1]
+    fmla    v31.2d, v3.2d, v11.2d[0]
+    fmla    v22.2d, v2.2d, v9.2d[0]
+    fmla    v27.2d, v3.2d, v10.2d[0]
+    fmla    v26.2d, v2.2d, v10.2d[0]
+    fmla    v23.2d, v3.2d, v9.2d[0]
+    fmla    v30.2d, v2.2d, v11.2d[0]
     fmla    v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro SAVE8x4
+    fmov    alpha0, alpha
+
+    prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
     add     ppCRow0, pCRow0, #32
 
-    ld1     {v0.2d, v1.2d}, [pCRow0]
+    ldp     q0, q1, [pCRow0]
     fmla    v0.2d, v16.2d, alphaV0
-    fmla    v1.2d, v17.2d, alphaV1
-    st1     {v0.2d, v1.2d}, [pCRow0]
+    fmla    v1.2d, v17.2d, alphaV0
+    stp     q0, q1, [pCRow0]
 
-    ld1     {v2.2d, v3.2d}, [ppCRow0]
-    fmla    v2.2d, v18.2d, alphaV2
-    fmla    v3.2d, v19.2d, alphaV3
-    st1     {v2.2d, v3.2d}, [ppCRow0]
-
-    add     pCRow1, pCRow0, LDC
-    add     ppCRow1, ppCRow0, LDC
-
-    ld1     {v4.2d, v5.2d}, [pCRow1]
-    fmla    v4.2d, v20.2d, alphaV0
-    fmla    v5.2d, v21.2d, alphaV1
-    st1     {v4.2d, v5.2d}, [pCRow1]
-
-    ld1     {v6.2d, v7.2d}, [ppCRow1]
-    fmla    v6.2d, v22.2d, alphaV2
-    fmla    v7.2d, v23.2d, alphaV3
-    st1     {v6.2d, v7.2d}, [ppCRow1]
-
-    add     pCRow2, pCRow1, LDC
-    add     ppCRow2, ppCRow1, LDC
-
-    ld1     {v0.2d, v1.2d}, [pCRow2]
-    fmla    v0.2d, v24.2d, alphaV0
-    fmla    v1.2d, v25.2d, alphaV1
-    st1     {v0.2d, v1.2d}, [pCRow2]
-
-    ld1     {v2.2d, v3.2d}, [ppCRow2]
-    fmla    v2.2d, v26.2d, alphaV2
-    fmla    v3.2d, v27.2d, alphaV3
-    st1     {v2.2d, v3.2d}, [ppCRow2]
-
-    add     pCRow1, pCRow2, LDC
-    add     ppCRow1, ppCRow2, LDC
-
-    ld1     {v4.2d, v5.2d}, [pCRow1]
-    fmla    v4.2d, v28.2d, alphaV0
-    fmla    v5.2d, v29.2d, alphaV1
-    st1     {v4.2d, v5.2d}, [pCRow1]
-
-    ld1     {v6.2d, v7.2d}, [ppCRow1]
-    fmla    v6.2d, v30.2d, alphaV2
-    fmla    v7.2d, v31.2d, alphaV3
-    st1     {v6.2d, v7.2d}, [ppCRow1]
-
     add     pCRow0, pCRow0, #64
 
+    ldp     q2, q3, [ppCRow0]
+    fmla    v2.2d, v18.2d, alphaV0
+    fmla    v3.2d, v19.2d, alphaV0
+    stp     q2, q3, [ppCRow0]
+
+    prfm    PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+
+    add     ppCRow1, pCRow1, #32
+
+    ldp     q4, q5, [pCRow1]
+    fmla    v4.2d, v20.2d, alphaV0
+    fmla    v5.2d, v21.2d, alphaV0
+    stp     q4, q5, [pCRow1]
+
+    add     pCRow1, pCRow1, #64
+
+    ldp     q6, q7, [ppCRow1]
+    fmla    v6.2d, v22.2d, alphaV0
+    fmla    v7.2d, v23.2d, alphaV0
+    stp     q6, q7, [ppCRow1]
+
+    prfm    PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+    add     ppCRow2, pCRow2, #32
+
+    ldp     q0, q1, [pCRow2]
+    fmla    v0.2d, v24.2d, alphaV0
+    fmla    v1.2d, v25.2d, alphaV0
+    stp     q0, q1, [pCRow2]
+
+    add     pCRow2, pCRow2, #64
+
+    ldp     q2, q3, [ppCRow2]
+    fmla    v2.2d, v26.2d, alphaV0
+    fmla    v3.2d, v27.2d, alphaV0
+    stp     q2, q3, [ppCRow2]
+
+    prfm    PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+
+    add     ppCRow3, pCRow3, #32
+
+    ldp     q4, q5, [pCRow3]
+    fmla    v4.2d, v28.2d, alphaV0
+    fmla    v5.2d, v29.2d, alphaV0
+    stp     q4, q5, [pCRow3]
+
+    add     pCRow3, pCRow3, #64
+
+    ldp     q6, q7, [ppCRow3]
+    fmla    v6.2d, v30.2d, alphaV0
+    fmla    v7.2d, v31.2d, alphaV0
+    stp     q6, q7, [ppCRow3]
 .endm
 
 /******************************************************************************/
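
Editorial note: for reference, here is a scalar C model of what one pass of the KERNEL8x4_* macros computes, a sketch assuming the operands were already packed by the copy routines, that pA and ppA each feed four of the eight rows, and that the 8x4 partial sums live in v16..v31 throughout (alpha is applied later, in SAVE8x4):

    #include <stddef.h>

    /* One k-step of the 8x4 micro-kernel is the outer product
     * c[8][4] += a[8] (x) b[4]; the loop runs it K times. */
    void kernel8x4_model(size_t K, const double *pA, const double *ppA,
                         const double *pB, double c[8][4])
    {
        for (size_t k = 0; k < K; k++) {
            const double *a0 = pA  + 4 * k;  /* rows 0..3 (v0/v1 halves) */
            const double *a1 = ppA + 4 * k;  /* rows 4..7 (v2/v3 halves) */
            const double *b  = pB  + 4 * k;  /* one value per C column   */
            for (int i = 0; i < 4; i++)
                for (int j = 0; j < 4; j++) {
                    c[i][j]     += a0[i] * b[j]; /* fmla, pCRow accumulators */
                    c[i + 4][j] += a1[i] * b[j]; /* fmla, ppC accumulators   */
                }
        }
    }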
@@ -403,30 +436,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x4
+    fmov    alpha0, alpha
+
     ld1     {v8.2d, v9.2d}, [pCRow0]
     fmla    v8.2d, v16.2d, alphaV0
-    fmla    v9.2d, v17.2d, alphaV1
+    fmla    v9.2d, v17.2d, alphaV0
     st1     {v8.2d, v9.2d}, [pCRow0]
 
     add     pCRow1, pCRow0, LDC
 
     ld1     {v12.2d, v13.2d}, [pCRow1]
-    fmla    v12.2d, v20.2d, alphaV2
-    fmla    v13.2d, v21.2d, alphaV3
+    fmla    v12.2d, v20.2d, alphaV0
+    fmla    v13.2d, v21.2d, alphaV0
     st1     {v12.2d, v13.2d}, [pCRow1]
 
     add     pCRow2, pCRow1, LDC
 
     ld1     {v8.2d, v9.2d}, [pCRow2]
     fmla    v8.2d, v24.2d, alphaV0
-    fmla    v9.2d, v25.2d, alphaV1
+    fmla    v9.2d, v25.2d, alphaV0
     st1     {v8.2d, v9.2d}, [pCRow2]
 
     add     pCRow1, pCRow2, LDC
 
     ld1     {v12.2d, v13.2d}, [pCRow1]
-    fmla    v12.2d, v28.2d, alphaV2
-    fmla    v13.2d, v29.2d, alphaV3
+    fmla    v12.2d, v28.2d, alphaV0
+    fmla    v13.2d, v29.2d, alphaV0
     st1     {v12.2d, v13.2d}, [pCRow1]
 
     add     pCRow0, pCRow0, #32
@@ -454,6 +489,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x4
+    fmov    alpha0, alpha
+
     ld1     {v8.2d}, [pCRow0]
     fmla    v8.2d, v16.2d, alphaV0
     st1     {v8.2d}, [pCRow0]
@@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add     pCRow1, pCRow0, LDC
 
     ld1     {v12.2d}, [pCRow1]
-    fmla    v12.2d, v20.2d, alphaV1
+    fmla    v12.2d, v20.2d, alphaV0
     st1     {v12.2d}, [pCRow1]
 
     add     pCRow2, pCRow1, LDC
 
     ld1     {v8.2d}, [pCRow2]
-    fmla    v8.2d, v24.2d, alphaV2
+    fmla    v8.2d, v24.2d, alphaV0
     st1     {v8.2d}, [pCRow2]
 
     add     pCRow1, pCRow2, LDC
 
     ld1     {v12.2d}, [pCRow1]
-    fmla    v12.2d, v28.2d, alphaV3
+    fmla    v12.2d, v28.2d, alphaV0
     st1     {v12.2d}, [pCRow1]
 
     add     pCRow0, pCRow0, #16
@@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x4
+    fmov    alpha0, alpha
+
     add     pCRow1, pCRow0, LDC
 
     ld1     {v8.d}[0], [pCRow0]
@@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld1     {v12.d}[0], [pCRow2]
     ld1     {v12.d}[1], [pCRow1]
-    fmla    v12.2d, v20.2d, alphaV1
+    fmla    v12.2d, v20.2d, alphaV0
     st1     {v12.d}[0], [pCRow2]
     st1     {v12.d}[1], [pCRow1]
@@ -540,16 +579,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x2
+    fmov    alpha0, alpha
+
     ld1     {v8.2d, v9.2d}, [pCRow0]
     fmla    v8.2d, v16.2d, alphaV0
-    fmla    v9.2d, v17.2d, alphaV1
+    fmla    v9.2d, v17.2d, alphaV0
     st1     {v8.2d, v9.2d}, [pCRow0]
 
     add     pCRow1, pCRow0, LDC
 
     ld1     {v12.2d, v13.2d}, [pCRow1]
-    fmla    v12.2d, v20.2d, alphaV2
-    fmla    v13.2d, v21.2d, alphaV3
+    fmla    v12.2d, v20.2d, alphaV0
+    fmla    v13.2d, v21.2d, alphaV0
     st1     {v12.2d, v13.2d}, [pCRow1]
 
     add     pCRow0, pCRow0, #32
@@ -574,6 +615,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x2
+    fmov    alpha0, alpha
+
     ld1     {v8.2d}, [pCRow0]
     fmla    v8.2d, v16.2d, alphaV0
     st1     {v8.2d}, [pCRow0]
@@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add     pCRow1 , pCRow0, LDC
 
     ld1     {v12.2d}, [pCRow1]
-    fmla    v12.2d, v20.2d, alphaV1
+    fmla    v12.2d, v20.2d, alphaV0
     st1     {v12.2d}, [pCRow1]
 
     add     pCRow0, pCRow0, #16
@@ -604,6 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x2
+    fmov    alpha0, alpha
+
     add     pCRow1 , pCRow0, LDC
 
     ld1     {v8.d}[0], [pCRow0]
@@ -634,9 +679,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x1
+    fmov    alpha0, alpha
+
     ld1     {v8.2d, v9.2d}, [pCRow0]
     fmla    v8.2d, v16.2d, alphaV0
-    fmla    v9.2d, v17.2d, alphaV1
+    fmla    v9.2d, v17.2d, alphaV0
     st1     {v8.2d, v9.2d}, [pCRow0]
 
     add     pCRow0, pCRow0, #32
@@ -662,6 +709,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x1
+    fmov    alpha0, alpha
+
     ld1     {v8.2d}, [pCRow0]
     fmla    v8.2d, v16.2d, alphaV0
     st1     {v8.2d}, [pCRow0]
@@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x1
+    fmov    alpha0, alpha
+
     ldr     d8, [pCRow0]
     fmadd   d8, d16, alpha0, d8
     str     d8, [pCRow0]
@@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     stp     x26, x27, [sp, #(9 * 16)]
     str     x28, [sp, #(10 * 16)]
 
-    fmov    alpha0, d0
-    fmov    alpha1, d0
-    fmov    alpha2, d0
-    fmov    alpha3, d0
+    fmov    alpha, d0
+    prfm    PLDL1KEEP, [origPA]
+    prfm    PLDL1KEEP, [origPB]
 
     lsl     LDC, LDC, #3    // ldc = ldc * 8
@@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ble     dgemm_kernel_L2_BEGIN
 
 dgemm_kernel_L4_BEGIN:
-    mov     pCRow0, pC      // pCRow0 = C
-    add     pC, pC, LDC, lsl #2
+    mov     pCRow0, pC
+    add     pCRow1, pCRow0, LDC
+    add     pCRow2, pCRow1, LDC
+    add     pCRow3, pCRow2, LDC
+    add     pC, pCRow3, LDC
 
     lsl     temp, origK, #5 // k * 4 * 8
     mov     pA, origPA      // pA = start of A array
     add     ppA, temp, pA
+    prfm    PLDL1KEEP, [ppA]
 
 //------------------------------------------------------------------------------
@@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN:
     cmp     counterI, #0
     ble     dgemm_kernel_L4_M4_BEGIN
 
+    .align 5
 dgemm_kernel_L4_M8_20:
 
     mov     pB, origPB
-    asr     counterL , origK, #1    // L = K / 2
-    cmp     counterL , #2           // is there at least 4 to do?
+    asr     counterL , origK, #2    // L = K / 4
+    cmp     counterL , #2
     blt     dgemm_kernel_L4_M8_32
 
-    KERNEL8x4_I // do one in the K
-    KERNEL8x4_M2    // do another in the K
+    KERNEL8x4_I
+    KERNEL8x4_M2
+    KERNEL8x4_M1
+    KERNEL8x4_M2
 
     subs    counterL, counterL, #2  // subtract 2
     ble     dgemm_kernel_L4_M8_22a
     .align 5
 
 dgemm_kernel_L4_M8_22:
 
+    KERNEL8x4_M1
+    KERNEL8x4_M2
     KERNEL8x4_M1
     KERNEL8x4_M2
 
     subs    counterL, counterL, #1
     bgt     dgemm_kernel_L4_M8_22
 
+    .align 5
 dgemm_kernel_L4_M8_22a:
 
+    KERNEL8x4_M1
+    KERNEL8x4_M2
     KERNEL8x4_M1
     KERNEL8x4_E
 
     b       dgemm_kernel_L4_M8_44
 
+    .align 5
 dgemm_kernel_L4_M8_32:
 
     tst     counterL, #1
     ble     dgemm_kernel_L4_M8_40
 
     KERNEL8x4_I
+    KERNEL8x4_M2
+    KERNEL8x4_M1
     KERNEL8x4_E
 
     b       dgemm_kernel_L4_M8_44
@@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40:
 
 dgemm_kernel_L4_M8_44:
 
-    ands    counterL , origK, #1
+    ands    counterL , origK, #3
     ble     dgemm_kernel_L4_M8_100
 
+    .align 5
 dgemm_kernel_L4_M8_46:
 
     KERNEL8x4_SUB
+    subs    counterL, counterL, #1
+    bne     dgemm_kernel_L4_M8_46
 
 dgemm_kernel_L4_M8_100:
+    lsl     temp, origK, #5
+    prfm    PLDL1KEEP, [pA, temp]
+    prfm    PLDL1KEEP, [ppA, temp]
+    prfm    PLDL1KEEP, [origPB]
 
     SAVE8x4
@@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END:
     subs    counterI, counterI, #1
     bne     dgemm_kernel_L4_M8_20
 
-
 dgemm_kernel_L4_M4_BEGIN:
     mov     counterI, origM
     tst     counterI , #7
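
Editorial note: the control flow above is easier to see in outline. The main loop is now unrolled to four k-steps per trip (asr counterL, origK, #2), with KERNEL8x4_I priming the software pipeline, M1/M2 alternating between two register sets (each computes on one set while loading the other for the next step), KERNEL8x4_E draining the final loads, and KERNEL8x4_SUB handling the K % 4 remainder. A C sketch of that schedule follows; the kernel_* functions are hypothetical stand-ins for the assembly macros:

    /* Hypothetical stubs standing in for the assembly macros. */
    static void kernel_I(void)   {}
    static void kernel_M1(void)  {}
    static void kernel_M2(void)  {}
    static void kernel_E(void)   {}
    static void kernel_SUB(void) {}

    /* Schedule of dgemm_kernel_L4_M8_20 .. _100; each macro does one k. */
    static void l4_m8_schedule(unsigned long K)
    {
        unsigned long L = K >> 2;             /* asr counterL, origK, #2 */
        if (L >= 2) {
            kernel_I(); kernel_M2(); kernel_M1(); kernel_M2();  /* prologue */
            for (unsigned long l = 0; l < L - 2; l++) {         /* _22 loop */
                kernel_M1(); kernel_M2(); kernel_M1(); kernel_M2();
            }
            kernel_M1(); kernel_M2(); kernel_M1(); kernel_E();  /* _22a */
        } else if (L == 1) {                                    /* _32 path */
            kernel_I(); kernel_M2(); kernel_M1(); kernel_E();
        }
        for (unsigned long r = 0; r < (K & 3); r++)             /* _46 loop */
            kernel_SUB();
    }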

kernel/arm64/dgemm_kernel_4x8.S: new executable file, 1689 lines (diff suppressed because it is too large)

kernel/arm64/dgemm_kernel_8x4.S: new executable file, 1570 lines (diff suppressed because it is too large)

kernel/arm64/dtrmm_kernel_4x8.S: new executable file, 2026 lines (diff suppressed because it is too large)

kernel/arm64/dtrmm_kernel_8x4.S: new executable file, 1849 lines (diff suppressed because it is too large)

File diff suppressed because it is too large

File diff suppressed because it is too large

kernel/arm64/strmm_kernel_16x4.S: new executable file, 2431 lines (diff suppressed because it is too large)

kernel/arm64/strmm_kernel_8x8.S: new executable file, 2795 lines (diff suppressed because it is too large)