Add ARMV8SVE to AArch64 Dynamic Dispatch
In order to enable support for future cores which have similar tunings (in this case I'm doing this for the Arm(R) Neoverse(TM) V2 core), this generically detects SVE support and enables it. This should better manage the size and complexity of dynamic dispatch rather than just copy pasting the same parameters. To make `ARMV8SVE` more representive of the common 128-bit SVE case, I've split it and similar parameters from A64FX which has the wider 512-bit SVE.
This commit is contained in:
parent
7976deff80
commit
f971ef55f2
|
@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1
|
|||
ifneq ($(NO_SVE), 1)
|
||||
DYNAMIC_CORE += NEOVERSEV1
|
||||
DYNAMIC_CORE += NEOVERSEN2
|
||||
DYNAMIC_CORE += ARMV8SVE
|
||||
endif
|
||||
DYNAMIC_CORE += CORTEXA55
|
||||
DYNAMIC_CORE += FALKOR
|
||||
|
|
|
@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
|
|||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
|
|||
#else
|
||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_ARMV8SVE
|
||||
extern gotoblas_t gotoblas_ARMV8SVE;
|
||||
#else
|
||||
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEX_A55
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#else
|
||||
|
@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
|
|||
#ifndef NO_SVE
|
||||
extern gotoblas_t gotoblas_NEOVERSEV1;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN2;
|
||||
extern gotoblas_t gotoblas_ARMV8SVE;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
|
||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||
#endif
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
|
@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg);
|
|||
#define FALLBACK_VERBOSE 1
|
||||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
#define NUM_CORETYPES 13
|
||||
#define NUM_CORETYPES 16
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
@ -173,6 +181,7 @@ static char *corename[] = {
|
|||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"cortexa55",
|
||||
"armv8sve",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
|
@ -192,6 +201,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 12: return (&gotoblas_NEOVERSEN2);
|
||||
case 13: return (&gotoblas_THUNDERX3T110);
|
||||
case 14: return (&gotoblas_CORTEXA55);
|
||||
case 15: return (&gotoblas_ARMV8SVE);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
|
|||
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
|
||||
openblas_warning(1, coremsg);
|
||||
}
|
||||
#ifndef NO_SVE
|
||||
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
return &gotoblas_ARMV8SVE;
|
||||
}
|
||||
#endif
|
||||
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
|
41
param.h
41
param.h
|
@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#elif defined(NEOVERSEV1)
|
||||
#elif defined(NEOVERSEV1) // 256-bit SVE
|
||||
|
||||
#if defined(XDOUBLE) || defined(DOUBLE)
|
||||
#define SWITCH_RATIO 8
|
||||
|
@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2)
|
||||
#elif defined(A64FX) // 512-bit SVE
|
||||
|
||||
/* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
|
||||
Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */
|
||||
|
@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
|
|||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE
|
||||
|
||||
#if defined(XDOUBLE) || defined(DOUBLE)
|
||||
#define SWITCH_RATIO 8
|
||||
#else
|
||||
#define SWITCH_RATIO 16
|
||||
#endif
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept seperate to keep copies seperate
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_MN 16
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_MN 16
|
||||
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 160
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_Q 352
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 224
|
||||
#define ZGEMM_DEFAULT_Q 112
|
||||
|
||||
#define SGEMM_DEFAULT_R 4096
|
||||
#define DGEMM_DEFAULT_R 4096
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#else /* Other/undetected ARMv8 cores */
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
|
|
Loading…
Reference in New Issue