diff --git a/Makefile.system b/Makefile.system
index bada954c1..6ff9da35f 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -681,7 +681,7 @@ DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
 endif
 
 ifeq ($(ARCH), loongarch64)
-DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
+DYNAMIC_CORE = LA464 LA264 LA64_GENERIC
 endif
 
 ifeq ($(ARCH), zarch)
diff --git a/TargetList.txt b/TargetList.txt
index d17caf480..631998353 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -121,9 +121,9 @@ RISCV64_GENERIC
 C910V
 
 11.LOONGARCH64:
-LOONGSONGENERIC
-LOONGSON3R5
-LOONGSON2K1000
+LA64_GENERIC
+LA464
+LA264
 
 12. Elbrus E2000:
 E2K
diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c
index ca07c7ffb..4d24aab98 100644
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@@ -32,52 +32,214 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 
 #include <stdint.h>
+#include <string.h>
 
-/* If LASX extension instructions supported,
- *  using core LOONGSON3R5
- * If only LSX extension instructions supported,
- *  using core LOONGSON2K1000
- * If neither LASX nor LSX extension instructions supported,
- *  using core LOONGSONGENERIC (As far as I know, there is no such
- *  CPU yet)
+/*
+ * Core selection: the PRID series field of CPUCFG word 0 picks LA464 or
+ * LA264; the core is reported only when CPUCFG word 2 also has the
+ * LASX_MASK (LA464) or LSX_MASK (LA264) bit set. Otherwise LA64_GENERIC.
  */
+#define CPU_GENERIC 0
+#define CPU_LA464 1
+#define CPU_LA264 2
 
-#define CPU_GENERIC 0
-#define CPU_LOONGSON3R5 1
-#define CPU_LOONGSON2K1000 2
+#define LOONGARCH_CFG0 0x00
+#define LOONGARCH_CFG2 0x02
+#define LOONGARCH_CFG10 0x10
+#define LOONGARCH_CFG11 0x11
+#define LOONGARCH_CFG12 0x12
+#define LOONGARCH_CFG13 0x13
+#define LOONGARCH_CFG14 0x14
+#define LASX_MASK (1<<7)
+#define LSX_MASK (1<<6)
+#define PRID_SERIES_MASK 0xf000
+#define PRID_SERIES_LA464 0xc000
+#define PRID_SERIES_LA264 0xa000
 
-#define LOONGARCH_CFG2 0x02
-#define LOONGARCH_LASX 1<<7
-#define LOONGARCH_LSX 1<<6
+#define CACHE_INFO_L1_IU 0
+#define CACHE_INFO_L1_D 1
+#define CACHE_INFO_L2_IU 2
+#define CACHE_INFO_L3_IU 3
+#define L1_IU_PRESENT_MASK 0x0001
+#define L1_IU_UNITY_MASK 0x0002
+#define L1_D_PRESENT_MASK 0x0004
+#define L2_IU_PRESENT_MASK 0x0008
+#define L2_IU_UNITY_MASK 0x0010
+#define L2_D_PRESENT_MASK 0x0080
+#define L3_IU_PRESENT_MASK 0x0400
+#define L3_IU_UNITY_MASK 0x0800
+#define L3_D_PRESENT_MASK 0x4000
+#define CACHE_WAY_MINUS_1_MASK 0x0000ffff
+#define CACHE_INDEX_LOG2_MASK 0x00ff0000
+#define CACHE_LINESIZE_LOG2_MASK 0x7f000000
+
+typedef struct {
+  int size;        /* total size in bytes; 0 means "no information" */
+  int associative; /* number of ways */
+  int linesize;    /* line size in bytes */
+  int unify;       /* non-zero when the unify bit for this cache is set */
+} cache_info_t;
 
 static char *cpuname[] = {
-  "LOONGSONGENERIC",
-  "LOONGSON3R5",
-  "LOONGSON2K1000"
+  "LA64_GENERIC",
+  "LA464",
+  "LA264"
 };
 
 static char *cpuname_lower[] = {
-  "loongsongeneric",
-  "loongson3r5",
-  "loongson2k1000"
+  "la64_generic",
+  "la464",
+  "la264"
 };
 
-int detect(void) {
-#ifdef __linux
-  uint32_t reg = 0;
+/*
+ * Fill *cacheinfo for the cache selected by type (CACHE_INFO_*).
+ * CPUCFG word 0x10 says which caches exist; the per-cache word holds
+ * ways-1, log2(index count) and log2(line size). All fields stay zero
+ * when the cache is absent or its byte size does not fit in an int.
+ */
+static void get_cacheinfo(int type, cache_info_t *cacheinfo) {
+  cache_info_t cache_info;
+  uint32_t present_mask = 0, unify_mask = 0, cfg_index = 0;
+  uint32_t reg_10 = 0, reg = 0;
+  memset(&cache_info, 0, sizeof(cache_info));
+  __asm__ volatile (
+    "cpucfg %0, %1 \n\t"
+    : "+&r"(reg_10)
+    : "r"(LOONGARCH_CFG10)
+  );
+  switch (type) {
+    case CACHE_INFO_L1_IU:
+      present_mask = L1_IU_PRESENT_MASK;
+      unify_mask = L1_IU_UNITY_MASK;
+      cfg_index = LOONGARCH_CFG11;
+      break;
+
+    case CACHE_INFO_L1_D:
+      /* No unify mask is defined for the data cache; unify stays 0. */
+      present_mask = L1_D_PRESENT_MASK;
+      cfg_index = LOONGARCH_CFG12;
+      break;
+
+    case CACHE_INFO_L2_IU:
+      present_mask = L2_IU_PRESENT_MASK;
+      unify_mask = L2_IU_UNITY_MASK;
+      cfg_index = LOONGARCH_CFG13;
+      break;
+
+    case CACHE_INFO_L3_IU:
+      present_mask = L3_IU_PRESENT_MASK;
+      unify_mask = L3_IU_UNITY_MASK;
+      cfg_index = LOONGARCH_CFG14;
+      break;
+
+    default:
+      break;
+  }
+  if (reg_10 & present_mask) {
+    uint32_t ways, line_log2, index_log2;
+    __asm__ volatile (
+      "cpucfg %0, %1 \n\t"
+      : "+&r"(reg)
+      : "r"(cfg_index)
+    );
+    ways = (reg & CACHE_WAY_MINUS_1_MASK) + 1;
+    line_log2 = (reg & CACHE_LINESIZE_LOG2_MASK) >> 24;
+    index_log2 = (reg & CACHE_INDEX_LOG2_MASK) >> 16;
+    /* Shifts replace pow(), so getarch needs no libm; the bounds keep
+     * the shift defined and the result representable as an int. */
+    if (line_log2 + index_log2 < 32) {
+      uint64_t bytes = (uint64_t)ways << (line_log2 + index_log2);
+      if (bytes <= 0x7fffffff) {
+        cache_info.unify = (reg_10 & unify_mask) != 0;
+        cache_info.associative = (int)ways;
+        cache_info.linesize = 1 << line_log2;
+        cache_info.size = (int)bytes;
+      }
+    }
+  }
+  *cacheinfo = cache_info;
+}
+
+/* Store in *count the number of "processor" lines in /proc/cpuinfo (Linux only). */
+static void get_cpucount(uint32_t *count) {
+#ifdef __linux
+  uint32_t num = 0;
+  FILE *f = fopen("/proc/cpuinfo", "r");
+  if (!f) return;
+  char buf[200];
+  while (fgets(buf, sizeof(buf), f))
+  {
+    if (!strncmp("processor", buf, 9))
+      num ++;
+  }
+  fclose(f);
+  *count = num;
+#endif
+}
+
+/* Return 1 when the LASX_MASK bit of CPUCFG word 2 is set, else 0. */
+static int support_lasx(void) {
+  uint32_t reg = 0;
 
   __asm__ volatile (
     "cpucfg %0, %1 \n\t"
     : "+&r"(reg)
     : "r"(LOONGARCH_CFG2)
   );
 
-  if (reg & LOONGARCH_LASX)
-    return CPU_LOONGSON3R5;
-  else if (reg & LOONGARCH_LSX)
-    return CPU_LOONGSON2K1000;
-  else
-    return CPU_GENERIC;
+  if (reg & LASX_MASK)
+    return 1;
+  return 0;
+}
+
+/* Return 1 when the LSX_MASK bit of CPUCFG word 2 is set, else 0. */
+static int support_lsx(void) {
+  uint32_t reg = 0;
+  __asm__ volatile (
+    "cpucfg %0, %1 \n\t"
+    : "+&r"(reg)
+    : "r"(LOONGARCH_CFG2)
+  );
+
+  if (reg & LSX_MASK)
+    return 1;
+  return 0;
+}
+
+/* Return CPUCFG word 0, whose PRID_SERIES_MASK bits identify the core series. */
+static uint32_t get_prid(void) {
+  uint32_t reg = 0;
+  __asm__ volatile (
+    "cpucfg %0, %1 \n\t"
+    : "+&r"(reg)
+    : "r"(LOONGARCH_CFG0)
+  );
+  return reg;
+}
+
+/* Map the PRID series plus the matching SIMD capability bit to a CPU_* index. */
+int detect(void) {
+#ifdef __linux
+  uint32_t prid = get_prid();
+  switch (prid & PRID_SERIES_MASK) {
+    case (PRID_SERIES_LA464):
+      if (support_lasx())
+        return CPU_LA464;
+      else
+        return CPU_GENERIC;
+      break;
+
+    case (PRID_SERIES_LA264):
+      if (support_lsx())
+        return CPU_LA264;
+      else
+        return CPU_GENERIC;
+      break;
+
+    default:
+      return CPU_GENERIC;
+  }
 #endif
   return CPU_GENERIC;
 }
@@ -100,41 +262,53 @@ void get_subdirname(void) {
 }
 
 void get_cpuconfig(void) {
-  int d = detect();
-  switch (d) {
-    case CPU_LOONGSON3R5:
-      printf("#define LOONGSON3R5\n");
-      printf("#define L1_DATA_SIZE 65536\n");
-      printf("#define L1_DATA_LINESIZE 64\n");
-      printf("#define L2_SIZE 1048576\n");
-      printf("#define L2_LINESIZE 64\n");
-      printf("#define DTB_DEFAULT_ENTRIES 64\n");
-      printf("#define DTB_SIZE 4096\n");
-      printf("#define L2_ASSOCIATIVE 16\n");
-      break;
+  /* Emit the core name, the probed cache geometry, the core count and fixed DTB values. */
+  cache_info_t info;
+  uint32_t num_cores = 0;
 
-    case CPU_LOONGSON2K1000:
-      printf("#define LOONGSON2K1000\n");
-      printf("#define L1_DATA_SIZE 65536\n");
-      printf("#define L1_DATA_LINESIZE 64\n");
-      printf("#define L2_SIZE 262144\n");
-      printf("#define L2_LINESIZE 64\n");
-      printf("#define DTB_DEFAULT_ENTRIES 64\n");
-      printf("#define DTB_SIZE 4096\n");
-      printf("#define L2_ASSOCIATIVE 16\n");
-      break;
+  printf("#define %s\n", cpuname[detect()]);
 
-    default:
-      printf("#define LOONGSONGENERIC\n");
-      printf("#define L1_DATA_SIZE 65536\n");
-      printf("#define L1_DATA_LINESIZE 64\n");
-      printf("#define L2_SIZE 262144\n");
-      printf("#define L2_LINESIZE 64\n");
-      printf("#define DTB_DEFAULT_ENTRIES 64\n");
-      printf("#define DTB_SIZE 4096\n");
-      printf("#define L2_ASSOCIATIVE 16\n");
-      break;
+  get_cacheinfo(CACHE_INFO_L1_IU, &info);
+  if (info.size > 0) {
+    printf("#define L1_CODE_SIZE %d\n", info.size);
+    printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative);
+    printf("#define L1_CODE_LINESIZE %d\n", info.linesize);
   }
+  get_cacheinfo(CACHE_INFO_L1_D, &info);
+  if (info.size > 0) {
+    printf("#define L1_DATA_SIZE %d\n", info.size);
+    printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative);
+    printf("#define L1_DATA_LINESIZE %d\n", info.linesize);
+  }
+  get_cacheinfo(CACHE_INFO_L2_IU, &info);
+  if (info.size > 0) {
+    if (info.unify) {
+      printf("#define L2_SIZE %d\n", info.size);
+      printf("#define L2_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L2_LINESIZE %d\n", info.linesize);
+    } else {
+      printf("#define L2_CODE_SIZE %d\n", info.size);
+      printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L2_CODE_LINESIZE %d\n", info.linesize);
+    }
+  }
+  get_cacheinfo(CACHE_INFO_L3_IU, &info);
+  if (info.size > 0) {
+    if (info.unify) {
+      printf("#define L3_SIZE %d\n", info.size);
+      printf("#define L3_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L3_LINESIZE %d\n", info.linesize);
+    } else {
+      printf("#define L3_CODE_SIZE %d\n", info.size);
+      printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative);
+      printf("#define L3_CODE_LINESIZE %d\n", info.linesize);
+    }
+  }
+  get_cpucount(&num_cores);
+  if (num_cores)
+    printf("#define NUM_CORES %u\n", num_cores);
+  printf("#define DTB_DEFAULT_ENTRIES 64\n");
+  printf("#define DTB_SIZE 4096\n");
 }
 
 void get_libname(void){
diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c
index 52f8bcb2f..6b948b5ea 100644
--- a/driver/others/dynamic_loongarch64.c
+++ b/driver/others/dynamic_loongarch64.c
@@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-extern gotoblas_t gotoblas_LOONGSON3R5;
-extern gotoblas_t gotoblas_LOONGSON2K1000;
-extern gotoblas_t gotoblas_LOONGSONGENERIC;
+extern gotoblas_t gotoblas_LA464;
+extern gotoblas_t gotoblas_LA264;
+extern gotoblas_t gotoblas_LA64_GENERIC;
 
 extern void openblas_warning(int verbose, const char * msg);
 
 #define NUM_CORETYPES 3
 
 static char *corename[] = {
-  "loongson3r5",
-  "loongson2k1000",
-  "loongsongeneric",
+  "la464",
+  "la264",
+  "la64_generic",
   "unknown"
 };
 
 char *gotoblas_corename(void) {
-  if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0];
-  if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1];
-  if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2];
+  if (gotoblas == &gotoblas_LA464) return corename[0];
+  if (gotoblas == &gotoblas_LA264) return corename[1];
+  if (gotoblas == &gotoblas_LA64_GENERIC) return corename[2];
   return corename[NUM_CORETYPES];
 }
 
@@ -65,33 +65,86 @@ static gotoblas_t *force_coretype(char *coretype) {
 
   switch (found)
   {
-    case 0: return (&gotoblas_LOONGSON3R5);
-    case 1: return (&gotoblas_LOONGSON2K1000);
-    case 2: return (&gotoblas_LOONGSONGENERIC);
+    case 0: return (&gotoblas_LA464);
+    case 1: return (&gotoblas_LA264);
+    case 2: return (&gotoblas_LA64_GENERIC);
   }
   snprintf(message, 128, "Core not found: %s\n", coretype);
   openblas_warning(1, message);
   return NULL;
 }
 
-#define LASX_MASK 1<<7
-#define LSX_MASK 1<<6
-#define LOONGARCH_CFG2 0x02
+#define LASX_MASK (1<<7)
+#define LSX_MASK (1<<6)
+#define LOONGARCH_CFG0 0x00
+#define LOONGARCH_CFG2 0x02
+#define PRID_SERIES_MASK 0xf000
+#define PRID_SERIES_LA464 0xc000
+#define PRID_SERIES_LA264 0xa000
 
-static gotoblas_t *get_coretype(void) {
-  int ret = 0;
+/* Return CPUCFG word 0, whose PRID_SERIES_MASK bits identify the core series. */
+static uint32_t get_prid(void) {
+  uint32_t reg = 0;
   __asm__ volatile (
     "cpucfg %0, %1 \n\t"
-    : "+&r"(ret)
+    : "+&r"(reg)
+    : "r"(LOONGARCH_CFG0)
+  );
+  return reg;
+}
+
+/* Return 1 when the LASX_MASK bit of CPUCFG word 2 is set, else 0. */
+static int support_lasx(void) {
+  uint32_t reg = 0;
+  __asm__ volatile (
+    "cpucfg %0, %1 \n\t"
+    : "+&r"(reg)
     : "r"(LOONGARCH_CFG2)
   );
 
-  if (ret & LASX_MASK)
-    return &gotoblas_LOONGSON3R5;
-  else if (ret & LSX_MASK)
-    return &gotoblas_LOONGSON2K1000;
-  else
-    return &gotoblas_LOONGSONGENERIC;
+  if (reg & LASX_MASK)
+    return 1;
+  return 0;
+}
+
+/* Return 1 when the LSX_MASK bit of CPUCFG word 2 is set, else 0. */
+static int support_lsx(void) {
+  uint32_t reg = 0;
+  __asm__ volatile (
+    "cpucfg %0, %1 \n\t"
+    : "+&r"(reg)
+    : "r"(LOONGARCH_CFG2)
+  );
+
+  if (reg & LSX_MASK)
+    return 1;
+  return 0;
+}
+
+/* Pick the kernel table for the running core: PRID series plus SIMD bit, else generic. */
+static gotoblas_t *get_coretype(void) {
+#ifdef __linux
+  uint32_t prid = get_prid();
+  switch (prid & PRID_SERIES_MASK) {
+    case (PRID_SERIES_LA464):
+      if (support_lasx())
+        return &gotoblas_LA464;
+      else
+        return &gotoblas_LA64_GENERIC;
+      break;
+
+    case (PRID_SERIES_LA264):
+      if (support_lsx())
+        return &gotoblas_LA264;
+      else
+        return &gotoblas_LA64_GENERIC;
+      break;
+
+    default:
+      return &gotoblas_LA64_GENERIC;
+  }
+#endif
+  return &gotoblas_LA64_GENERIC;
 }
 
 void gotoblas_dynamic_init(void) {
diff --git a/getarch.c b/getarch.c
index 7761551ea..1113d02ce 100644
--- a/getarch.c
+++ b/getarch.c
@@ -134,9 +134,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* #define FORCE_SICORTEX */
 /* #define FORCE_LOONGSON3R3 */
 /* #define FORCE_LOONGSON3R4 */
-/* #define FORCE_LOONGSON3R5 */
-/* #define FORCE_LOONGSON2K1000 */
-/* #define FORCE_LOONGSONGENERIC */
+/* #define FORCE_LA464 */
+/* #define FORCE_LA264 */
+/* #define FORCE_LA64_GENERIC */
 /* #define FORCE_I6400 */
 /* #define FORCE_P6600 */
 /* #define FORCE_P5600 */
@@ -957,45 +957,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif
 
-#ifdef FORCE_LOONGSON3R5
+#ifdef FORCE_LA464
 #define FORCE
 #define ARCHITECTURE "LOONGARCH"
-#define SUBARCHITECTURE "LOONGSON3R5"
+#define SUBARCHITECTURE "LA464"
 #define SUBDIRNAME "loongarch64"
-#define ARCHCONFIG "-DLOONGSON3R5 " \
- "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
- "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
- "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
-#define LIBNAME "loongson3r5"
-#define CORENAME "LOONGSON3R5"
-#else
-#endif
-
-#ifdef FORCE_LOONGSON2K1000
-#define FORCE
-#define ARCHITECTURE "LOONGARCH"
-#define SUBARCHITECTURE "LOONGSON2K1000"
-#define SUBDIRNAME "loongarch64"
-#define ARCHCONFIG "-DLOONGSON2K1000 " \
+#define ARCHCONFIG "-DLA464 " \
  "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
  "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
  "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
-#define LIBNAME "loongson2k1000"
-#define CORENAME "LOONGSON2K1000"
+#define LIBNAME "la464"
+#define CORENAME "LA464"
 #else
 #endif
 
-#ifdef FORCE_LOONGSONGENERIC
+#ifdef FORCE_LA264
 #define FORCE
 #define ARCHITECTURE "LOONGARCH"
-#define SUBARCHITECTURE "LOONGSONGENERIC"
+#define SUBARCHITECTURE "LA264"
 #define SUBDIRNAME "loongarch64"
-#define ARCHCONFIG "-DLOONGSONGENERIC " \
- "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
+#define ARCHCONFIG "-DLA264 " \
+ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+ "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME "la264"
+#define CORENAME "LA264"
+#else
+#endif
+
+#ifdef FORCE_LA64_GENERIC
+#define FORCE
+#define ARCHITECTURE "LOONGARCH"
+#define SUBARCHITECTURE "LA64_GENERIC"
+#define SUBDIRNAME "loongarch64"
+#define ARCHCONFIG "-DLA64_GENERIC " \
+ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
  "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
- "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
-#define LIBNAME "loongsongeneric"
-#define CORENAME "LOONGSONGENERIC"
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME "la64_generic"
+#define CORENAME "LA64_GENERIC"
 #else
 #endif
 
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LA464
similarity index 100%
rename from kernel/loongarch64/KERNEL.LOONGSON3R5
rename to kernel/loongarch64/KERNEL.LA464
diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile
index 71e5a87cb..1c85667ec 100644
--- a/lapack/laswp/loongarch64/Makefile
+++ b/lapack/laswp/loongarch64/Makefile
@@ -1,6 +1,11 @@
 TOPDIR = ../../..
 include ../../../Makefile.system
 
+ifeq ($(DYNAMIC_ARCH), 1)
+LASWP = ../generic/laswp_k_4.c
+ZLASWP = ../generic/zlaswp_k_4.c
+endif
+
 ifndef LASWP
 LASWP = ../generic/laswp_k.c
 endif
diff --git a/param.h b/param.h
index dc02147d8..251c642a1 100644
--- a/param.h
+++ b/param.h
@@ -2835,7 +2835,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-#if defined (LOONGSON3R5)
+#if defined (LA464)
 #define SNUMOPT 2
 #define DNUMOPT 2
 
@@ -2875,7 +2875,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-#ifdef LOONGSON2K1000
+#ifdef LA264
 #define GEMM_DEFAULT_OFFSET_A 0
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
@@ -2910,7 +2910,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-#ifdef LOONGSONGENERIC
+#ifdef LA64_GENERIC
 #define GEMM_DEFAULT_OFFSET_A 0
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL