Use SVE kernel for SGEMM/DGEMM on Arm(R) Neoverse(TM) V1
After #3868, the SVE kernels represent a pretty good boost. This re-uses ARMV8SVE as a base and I'm going to incrementally move everything to use ARMV8SVE in additional patches (as well as fix up anything that's not already in ARMV8SVE).
This commit is contained in:
parent
84a1d76d1e
commit
7121e16684
|
@ -1,75 +1,8 @@
|
||||||
SAMINKERNEL = ../arm/amin.c
|
# Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||||
DAMINKERNEL = ../arm/amin.c
|
|
||||||
CAMINKERNEL = ../arm/zamin.c
|
|
||||||
ZAMINKERNEL = ../arm/zamin.c
|
|
||||||
|
|
||||||
SMAXKERNEL = ../arm/max.c
|
include $(KERNELDIR)/KERNEL.ARMV8SVE
|
||||||
DMAXKERNEL = ../arm/max.c
|
|
||||||
|
|
||||||
SMINKERNEL = ../arm/min.c
|
|
||||||
DMINKERNEL = ../arm/min.c
|
|
||||||
|
|
||||||
ISAMINKERNEL = ../arm/iamin.c
|
|
||||||
IDAMINKERNEL = ../arm/iamin.c
|
|
||||||
ICAMINKERNEL = ../arm/izamin.c
|
|
||||||
IZAMINKERNEL = ../arm/izamin.c
|
|
||||||
|
|
||||||
ISMAXKERNEL = ../arm/imax.c
|
|
||||||
IDMAXKERNEL = ../arm/imax.c
|
|
||||||
|
|
||||||
ISMINKERNEL = ../arm/imin.c
|
|
||||||
IDMINKERNEL = ../arm/imin.c
|
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
SAMAXKERNEL = amax.S
|
|
||||||
DAMAXKERNEL = amax.S
|
|
||||||
CAMAXKERNEL = zamax.S
|
|
||||||
ZAMAXKERNEL = zamax.S
|
|
||||||
|
|
||||||
SAXPYKERNEL = axpy.S
|
|
||||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||||
CAXPYKERNEL = zaxpy.S
|
|
||||||
ZAXPYKERNEL = zaxpy.S
|
|
||||||
|
|
||||||
SROTKERNEL = rot.S
|
|
||||||
DROTKERNEL = rot.S
|
|
||||||
CROTKERNEL = zrot.S
|
|
||||||
ZROTKERNEL = zrot.S
|
|
||||||
|
|
||||||
SSCALKERNEL = scal.S
|
|
||||||
DSCALKERNEL = scal.S
|
|
||||||
CSCALKERNEL = zscal.S
|
|
||||||
ZSCALKERNEL = zscal.S
|
|
||||||
|
|
||||||
SGEMVNKERNEL = gemv_n.S
|
|
||||||
DGEMVNKERNEL = gemv_n.S
|
|
||||||
CGEMVNKERNEL = zgemv_n.S
|
|
||||||
ZGEMVNKERNEL = zgemv_n.S
|
|
||||||
|
|
||||||
SGEMVTKERNEL = gemv_t.S
|
|
||||||
DGEMVTKERNEL = gemv_t.S
|
|
||||||
CGEMVTKERNEL = zgemv_t.S
|
|
||||||
ZGEMVTKERNEL = zgemv_t.S
|
|
||||||
|
|
||||||
|
|
||||||
SASUMKERNEL = sasum_thunderx2t99.c
|
SASUMKERNEL = sasum_thunderx2t99.c
|
||||||
DASUMKERNEL = dasum_thunderx2t99.c
|
DASUMKERNEL = dasum_thunderx2t99.c
|
||||||
|
@ -100,67 +33,6 @@ DDOTKERNEL = dot.c
|
||||||
SDOTKERNEL = dot.c
|
SDOTKERNEL = dot.c
|
||||||
CDOTKERNEL = zdot_thunderx2t99.c
|
CDOTKERNEL = zdot_thunderx2t99.c
|
||||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||||
DSDOTKERNEL = dot.S
|
|
||||||
|
|
||||||
DGEMM_BETA = dgemm_beta.S
|
|
||||||
SGEMM_BETA = sgemm_beta.S
|
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
|
||||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
|
||||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
|
||||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
|
||||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
|
||||||
else
|
|
||||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
|
||||||
endif
|
|
||||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
|
||||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
|
||||||
else
|
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
|
||||||
endif
|
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
endif
|
|
||||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
|
||||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
|
||||||
else
|
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
|
||||||
endif
|
|
||||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
|
||||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
|
||||||
else
|
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
|
||||||
endif
|
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
|
|
||||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
|
||||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
|
||||||
|
|
||||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
|
||||||
|
|
||||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
|
||||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
|
||||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
|
||||||
else
|
|
||||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
|
||||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
|
||||||
endif
|
|
||||||
|
|
||||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
|
||||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
|
||||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
|
||||||
else
|
|
||||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
|
||||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
|
||||||
endif
|
|
||||||
|
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
|
|
||||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
|
@ -182,6 +54,10 @@ ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
else
|
||||||
|
# TODO(Mousius) - check performance of the SVE kernels again and remove this
|
||||||
|
ZGEMMINCOPYOBJ =
|
||||||
|
ZGEMMITCOPYOBJ =
|
||||||
endif
|
endif
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
|
18
param.h
18
param.h
|
@ -3369,11 +3369,21 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
||||||
|
|
||||||
#define SWITCH_RATIO 16
|
#define SWITCH_RATIO 16
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
|
||||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
Until then, just keep it different from SGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||||
|
/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N)
|
||||||
|
* Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro.
|
||||||
|
* If SVE size is ever more than 1024, this should be increased also. */
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_MN 32
|
||||||
|
|
||||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
/* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
|
||||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
Until then, just keep it different from DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||||
|
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_MN 32
|
||||||
|
|
||||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
Loading…
Reference in New Issue