From f971ef55f2ce09d60b08137ca0608a6475a7611f Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 25 Jul 2023 11:56:33 +0100 Subject: [PATCH] Add ARMV8SVE to AArch64 Dynamic Dispatch In order to enable support for future cores which have similar tunings (in this case I'm doing this for the Arm(R) Neoverse(TM) V2 core), this generically detects SVE support and enables it. This should better manage the size and complexity of dynamic dispatch rather than just copy-pasting the same parameters. To make `ARMV8SVE` more representative of the common 128-bit SVE case, I've split it and similar parameters from A64FX which has the wider 512-bit SVE. --- Makefile.system | 1 + cmake/arch.cmake | 2 +- driver/others/dynamic_arm64.c | 19 +++++++++++++++- param.h | 41 +++++++++++++++++++++++++++++++++-- 4 files changed, 59 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index 7d26eccc3..62926b380 100644 --- a/Makefile.system +++ b/Makefile.system @@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 +DYNAMIC_CORE += ARMV8SVE endif DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e6e434a0a..07df31b89 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index ef2597234..530d18115 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,5 +1,6 @@ 
/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; #else #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #endif +#ifdef DYN_ARMV8SVE +extern gotoblas_t gotoblas_ARMV8SVE; +#else +#define gotoblas_ARMV8SVE gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifndef NO_SVE extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; +extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 +#define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_CORTEXA55; @@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. 
OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 13 +#define NUM_CORETYPES 16 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -173,6 +181,7 @@ static char *corename[] = { "neoversen2", "thunderx3t110", "cortexa55", + "armv8sve", "unknown" }; @@ -192,6 +201,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; return corename[NUM_CORETYPES]; } @@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 12: return (&gotoblas_NEOVERSEN2); case 13: return (&gotoblas_THUNDERX3T110); case 14: return (&gotoblas_CORTEXA55); + case 15: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } +#ifndef NO_SVE + if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { + return &gotoblas_ARMV8SVE; + } +#endif + return NULL; #endif } diff --git a/param.h b/param.h index aa193a284..84e0c2ac7 100644 --- a/param.h +++ b/param.h @@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(NEOVERSEV1) +#elif defined(NEOVERSEV1) // 256-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 @@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) +#elif defined(A64FX) // 512-bit SVE /* 
When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ @@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept separate to keep copies separate +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16