Add ARMV8SVE to AArch64 Dynamic Dispatch

In order to enable support for future cores which have similar tunings
(in this case I'm doing this for the Arm(R) Neoverse(TM) V2 core), this generically detects SVE support and enables it. This should better manage the size and complexity of dynamic dispatch rather than just copy pasting the same parameters.

To make `ARMV8SVE` more representive of the common 128-bit SVE case,
I've split it and similar parameters from A64FX which has the wider
512-bit SVE.
This commit is contained in:
Chris Sidebottom
2023-07-25 11:56:33 +01:00
parent 7976deff80
commit f971ef55f2
4 changed files with 59 additions and 4 deletions

41
param.h
View File

@@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#elif defined(NEOVERSEV1)
#elif defined(NEOVERSEV1) // 256-bit SVE
#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
@@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2)
#elif defined(A64FX) // 512-bit SVE
/* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */
@@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE
#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16
#endif
#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept seperate to keep copies seperate
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 8
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_MN 16
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_MN 16
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 352
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#else /* Other/undetected ARMv8 cores */
#define SGEMM_DEFAULT_UNROLL_M 16