Merge pull request #2618 from craft-zhang/cortex-A53

Improve performance of SGEMM and STRMM on Arm Cortex-A53
This commit is contained in:
Martin Kroeker 2020-05-25 12:14:46 +02:00 committed by GitHub
commit f1a18d245b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 5381 additions and 2 deletions

View File

@ -187,3 +187,6 @@ In chronological order:
* Marius Hillenbrand <https://github.com/mhillenibm> * Marius Hillenbrand <https://github.com/mhillenibm>
* [2020-05-12] Revise dynamic architecture detection for IBM z * [2020-05-12] Revise dynamic architecture detection for IBM z
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14 * [2020-05-12] Add new sgemm and strmm kernel for IBM z14
* Danfeng Zhang <https://github.com/craft-zhang>
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53

View File

@ -1,3 +1,194 @@
include $(KERNELDIR)/KERNEL.ARMV8 SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
SDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
else
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
endif
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(SGEMM_UNROLL_N), 16)
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
else
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
endif
ifeq ($(SGEMM_UNROLL_N), 4)
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
else
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

31
param.h
View File

@ -2623,7 +2623,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16 #define SYMV_P 16
#if defined(CORTEXA53) || defined(CORTEXA57) || \ #if defined(CORTEXA57) || \
defined(CORTEXA72) || defined(CORTEXA73) || \ defined(CORTEXA72) || defined(CORTEXA73) || \
defined(FALKOR) || defined(TSV110) || defined(EMAG8180) defined(FALKOR) || defined(TSV110) || defined(EMAG8180)
@ -2669,6 +2669,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048 #define ZGEMM_DEFAULT_R 2048
#elif defined(CORTEXA53)
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 256
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 256
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048
#elif defined(THUNDERX) #elif defined(THUNDERX)
#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_M 4