diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX
index ee66fea8e..80be4ddd0 100644
--- a/kernel/arm64/KERNEL.A64FX
+++ b/kernel/arm64/KERNEL.A64FX
@@ -114,8 +114,8 @@ DSDOTKERNEL = dot.S
 DGEMM_BETA = dgemm_beta.S
 SGEMM_BETA = sgemm_beta.S
 
-SGEMMKERNEL = sgemm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-STRMMKERNEL = strmm_kernel_8x$(SGEMM_UNROLL_N).S
+SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
 
 SGEMMINCOPY = sgemm_ncopy_sve_v1.c
 SGEMMITCOPY = sgemm_tcopy_sve_v1.c
@@ -127,6 +127,11 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
+STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
+STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
+STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
+
 SSYMMUCOPY_M = symm_ucopy_sve.c
 SSYMMLCOPY_M = symm_lcopy_sve.c
 
diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE
index 1f605d10b..0364a929c 100644
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -114,35 +114,27 @@ DSDOTKERNEL = dot.S
 DGEMM_BETA = dgemm_beta.S
 SGEMM_BETA = sgemm_beta.S
 
-SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
+SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
+
+SGEMMINCOPY = sgemm_ncopy_sve_v1.c
+SGEMMITCOPY = sgemm_tcopy_sve_v1.c
+SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
+SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
+
 SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ifeq ($(SGEMM_UNROLL_N), 16)
-SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
-endif
-ifeq ($(SGEMM_UNROLL_N), 4)
-SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
-endif
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
+STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
+STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
+STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
+
+SSYMMUCOPY_M = symm_ucopy_sve.c
+SSYMMLCOPY_M = symm_lcopy_sve.c
+
 DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
 DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
 
diff --git a/param.h b/param.h
index e9419bd9d..f7b8eb07b 100644
--- a/param.h
+++ b/param.h
@@ -3296,14 +3296,23 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(ARMV8SVE) || defined(A64FX)
 
+/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
+Until then, just keep it different from SGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
 #define SGEMM_DEFAULT_UNROLL_M 4
 #define SGEMM_DEFAULT_UNROLL_N 8
+/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N)
+ * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro.
+ * If SVE size is ever more than 1024, this should be increased also. */
+#define SGEMM_DEFAULT_UNROLL_MN 32
 
 /* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
 Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */
 #define DGEMM_DEFAULT_UNROLL_M 2
 #define DGEMM_DEFAULT_UNROLL_N 8
 
+/* Set manually for the same reason as SGEMM_DEFAULT_UNROLL_MN: DGEMM_DEFAULT_UNROLL_M above is a placeholder, not "sve_vl". */
+#define DGEMM_DEFAULT_UNROLL_MN 32
+
 #define CGEMM_DEFAULT_UNROLL_M 8
 #define CGEMM_DEFAULT_UNROLL_N 4
 