diff --git a/CMakeLists.txt b/CMakeLists.txt index c5c7aaa6a..e7821d77a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,9 +23,9 @@ if(MSVC AND NOT DEFINED NOFORTRAN) endif() ####### -if(MSVC) - option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) -endif() +option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) + +option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) @@ -320,7 +320,9 @@ if (NOT NOFORTRAN) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() - add_subdirectory(lapack-netlib/TESTING) + if (BUILD_TESTING) + add_subdirectory(lapack-netlib/TESTING) + endif() if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) add_subdirectory(cpp_thread_test) endif() diff --git a/Makefile.arm64 b/Makefile.arm64 index 2eade8d78..9844d2083 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -55,6 +55,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif endif +ifeq ($(CORE), FT2000) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif + # Use a72 tunings because Neoverse-N1 is only available # in GCC>=9 ifeq ($(CORE), NEOVERSEN1) @@ -229,6 +236,43 @@ endif endif endif +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), CORTEXX1) +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +endif + +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), CORTEXX2) +CCOMMON_OPT += -march=armv8.4-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a+sve +endif +endif +endif + +#ifeq (1, $(filter 1,$(ISCLANG))) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), CORTEXA510) +CCOMMON_OPT += -march=armv8.4-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a+sve 
+endif +endif +endif + +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), CORTEXA710) +CCOMMON_OPT += -march=armv8.4-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a+sve +endif +endif +endif + endif endif diff --git a/Makefile.prebuild b/Makefile.prebuild index 399db956f..5e8874d42 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -71,7 +71,8 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) + avx512=$$(perl c_check - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ + $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c config.h dummy ifndef TARGET_CORE diff --git a/TargetList.txt b/TargetList.txt index a5a07a661..a297fd0e8 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -92,6 +92,10 @@ CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 +CORTEXA510 +CORTEXA710 +CORTEXX1 +CORTEXX2 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 @@ -103,6 +107,9 @@ THUNDERX2T99 TSV110 THUNDERX3T110 VORTEX +A64FX +ARMV8SVE +FT2000 9.System Z: ZARCH_GENERIC diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 04ed428de..1545d56db 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -65,7 +65,7 @@ jobs: - task: CMake@1 inputs: workingDirectory: 'build' # Optional - cmakeArgs: '-G "Visual Studio 16 2019" ..' + cmakeArgs: '-G "Visual Studio 17 2022" ..' - task: CMake@1 inputs: cmakeArgs: '--build . 
--config Release' @@ -103,7 +103,7 @@ jobs: - job: Windows_flang_clang pool: - vmImage: 'windows-latest' + vmImage: 'windows-2022' steps: - script: | set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" @@ -114,8 +114,8 @@ jobs: conda install --yes --quiet ninja flang mkdir build cd build - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. cmake --build . --config Release ctest @@ -178,7 +178,7 @@ jobs: cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. cmake --build . 
ctest - + - job: OSX_Ifort_Clang pool: vmImage: 'macOS-10.15' diff --git a/c_check b/c_check index 999f5a7a7..f9d3f2ca2 100644 --- a/c_check +++ b/c_check @@ -254,7 +254,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { # $tmpf = new File::Temp( UNLINK => 1 ); ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; - print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; + print $fh "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; $args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf"; if ($compiler eq "PGI") { $args = " -tp skylake -c -o $tmpf.o $tmpf"; @@ -278,7 +278,7 @@ if ($data =~ /HAVE_C11/) { $c11_atomics = 0; } else { ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); - print $tmpf "#include <stdatomic.h>\nint main(void){}\n"; + print $fh "#include <stdatomic.h>\nint main(void){}\n"; $args = " -c -o $tmpf.o $tmpf"; my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); system(@cmd) == 0; @@ -316,6 +316,7 @@ if ($architecture ne $hostarch) { } $cross = 1 if ($os ne $hostos); +$cross = 0 if (($os eq "Android") && ($hostos eq "Linux") && defined($ENV{TERMUX_APP_PID}) && ($ENV{TERMUX_APP_PID} ne "")); $openmp = "" if $ENV{USE_OPENMP} != 1; diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 06bc14986..57e42781d 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -161,6 +161,30 @@ if (${CORE} STREQUAL ARMV8SVE) endif () endif () +if (${CORE} STREQUAL CORTEXA510) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + +if (${CORE} STREQUAL CORTEXA710) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + +if (${CORE} STREQUAL CORTEXX1) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") + endif () +endif () + +if (${CORE} STREQUAL CORTEXX2) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + if (${CORE} STREQUAL POWER10) if (NOT DYNAMIC_ARCH) 
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 9feda9be3..94199605d 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -67,7 +67,15 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") if (INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") + if (WIN32) + set(FCOMMON_OPT "${FCOMMON_OPT} /integer-size:64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -integer-size 64") + endif () + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () endif () else () set(FCOMMON_OPT "${FCOMMON_OPT} -m32") diff --git a/common_macro.h b/common_macro.h index 9826f1809..d2fa822c2 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2610,8 +2610,9 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ -|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) +#if !defined(DYNAMIC_ARCH) \ + && (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \ + || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/cpuid_arm64.c b/cpuid_arm64.c index cc3a82815..89ec18632 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -45,6 +45,10 @@ size_t length64=sizeof(value64); #define CPU_NEOVERSEN1 11 #define CPU_NEOVERSEV1 16 #define CPU_NEOVERSEN2 17 +#define CPU_CORTEXX1 18 +#define CPU_CORTEXX2 19 +#define CPU_CORTEXA510 20 +#define CPU_CORTEXA710 21 // Qualcomm #define CPU_FALKOR 6 // Cavium @@ -59,6 +63,8 @@ size_t length64=sizeof(value64); #define CPU_VORTEX 13 // Fujitsu #define CPU_A64FX 15 +// Phytium +#define CPU_FT2000 22 static char *cpuname[] = { "UNKNOWN", @@ -73,12 +79,17 @@ static char *cpuname[] = { "TSV110", "EMAG8180", "NEOVERSEN1", - 
"NEOVERSEV1" - "NEOVERSEN2" "THUNDERX3T110", "VORTEX", "CORTEXA55", - "A64FX" + "A64FX", + "NEOVERSEV1", + "NEOVERSEN2", + "CORTEXX1", + "CORTEXX2", + "CORTEXA510", + "CORTEXA710", + "FT2000" }; static char *cpuname_lower[] = { @@ -94,12 +105,17 @@ static char *cpuname_lower[] = { "tsv110", "emag8180", "neoversen1", - "neoversev1", - "neoversen2", "thunderx3t110", "vortex", "cortexa55", - "a64fx" + "a64fx", + "neoversev1", + "neoversen2", + "cortexx1", + "cortexx2", + "cortexa510", + "cortexa710", + "ft2000" }; int get_feature(char *search) @@ -182,6 +198,14 @@ int detect(void) return CPU_NEOVERSEN2; else if (strstr(cpu_part, "0xd05")) return CPU_CORTEXA55; + else if (strstr(cpu_part, "0xd46")) + return CPU_CORTEXA510; + else if (strstr(cpu_part, "0xd47")) + return CPU_CORTEXA710; + else if (strstr(cpu_part, "0xd44")) + return CPU_CORTEXX1; + else if (strstr(cpu_part, "0xd4c")) + return CPU_CORTEXX2; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) @@ -202,6 +226,13 @@ int detect(void) // Fujitsu else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001")) return CPU_A64FX; + // Apple + else if (strstr(cpu_implementer, "0x61") && strstr(cpu_part, "0x022")) + return CPU_VORTEX; + // Phytium + else if (strstr(cpu_implementer, "0x70") && (strstr(cpu_part, "0x660") || strstr(cpu_part, "0x661") + || strstr(cpu_part, "0x662") || strstr(cpu_part, "0x663"))) + return CPU_FT2000; } p = (char *) NULL ; @@ -382,7 +413,24 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); break; - + case CPU_CORTEXA510: + case CPU_CORTEXA710: + case CPU_CORTEXX1: + case CPU_CORTEXX2: + printf("#define ARMV9\n"); + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define 
L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; case CPU_FALKOR: printf("#define FALKOR\n"); printf("#define L1_CODE_SIZE 65536\n"); @@ -469,9 +517,9 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; -#ifdef __APPLE__ case CPU_VORTEX: printf("#define VORTEX \n"); +#ifdef __APPLE__ sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); printf("#define L1_CODE_SIZE %lld \n",value64); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); @@ -480,10 +528,10 @@ void get_cpuconfig(void) printf("#define L1_DATA_SIZE %lld \n",value64); sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); printf("#define L2_SIZE %lld \n",value64); +#endif printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; -#endif case CPU_A64FX: printf("#define A64FX\n"); printf("#define L1_CODE_SIZE 65535\n"); @@ -494,6 +542,16 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_FT2000: + printf("#define FT2000\n"); + printf("#define L1_CODE_SIZE 32768\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 33554432\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; } get_cpucount(); } diff --git a/cpuid_x86.c b/cpuid_x86.c index d7d85eb20..4ac1de047 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1707,8 +1707,18 @@ int get_cpuname(void){ if (model == 0xf && stepping < 0xe) return CPUTYPE_NANO; return CPUTYPE_NEHALEM; + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return CPUTYPE_ZEN; + else + return CPUTYPE_DUNNINGTON; + default: + return CPUTYPE_NEHALEM; + } default: - if 
(family >= 0x7) + if (family >= 0x8) return CPUTYPE_NEHALEM; else return CPUTYPE_VIAC3; @@ -1716,7 +1726,20 @@ int get_cpuname(void){ } if (vendor == VENDOR_ZHAOXIN){ - return CPUTYPE_NEHALEM; + switch (family) { + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return CPUTYPE_ZEN; + else + return CPUTYPE_DUNNINGTON; + default: + return CPUTYPE_NEHALEM; + } + default: + return CPUTYPE_NEHALEM; + } } if (vendor == VENDOR_RISE){ @@ -2416,8 +2439,18 @@ int get_coretype(void){ if (model == 0xf && stepping < 0xe) return CORE_NANO; return CORE_NEHALEM; + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return CORE_ZEN; + else + return CORE_DUNNINGTON; + default: + return CORE_NEHALEM; + } default: - if (family >= 0x7) + if (family >= 0x8) return CORE_NEHALEM; else return CORE_VIAC3; @@ -2425,7 +2458,20 @@ int get_coretype(void){ } if (vendor == VENDOR_ZHAOXIN) { - return CORE_NEHALEM; + switch (family) { + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return CORE_ZEN; + else + return CORE_DUNNINGTON; + default: + return CORE_NEHALEM; + } + default: + return CORE_NEHALEM; + } } return CORE_UNKNOWN; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 52a7c6087..df7fa67e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -96,7 +96,7 @@ extern gotoblas_t gotoblas_BARCELONA; #endif #ifdef DYN_ATOM extern gotoblas_t gotoblas_ATOM; -elif defined(DYN_NEHALEM) +#elif defined(DYN_NEHALEM) #define gotoblas_ATOM gotoblas_NEHALEM #else #define gotoblas_ATOM gotoblas_PRESCOTT @@ -875,14 +875,37 @@ static gotoblas_t *get_coretype(void){ if (model == 0xf && stepping < 0xe) return &gotoblas_NANO; return &gotoblas_NEHALEM; + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return &gotoblas_ZEN; + else + return &gotoblas_DUNNINGTON; + default: + return &gotoblas_NEHALEM; + } default: - if (family >= 0x7) + if (family >= 0x8) return &gotoblas_NEHALEM; } } if (vendor == VENDOR_ZHAOXIN) { 
- return &gotoblas_NEHALEM; + switch (family) { + case 0x7: + switch (exmodel) { + case 5: + if (support_avx2()) + return &gotoblas_ZEN; + else + return &gotoblas_DUNNINGTON; + default: + return &gotoblas_NEHALEM; + } + default: + return &gotoblas_NEHALEM; + } } return NULL; diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 7fefee33d..7a5cbeb62 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -60,6 +60,9 @@ static char* openblas_config_str="" #ifdef USE_OPENMP "USE_OPENMP " #endif +#ifdef USE_TLS + "USE_TLS " +#endif #ifndef DYNAMIC_ARCH CHAR_CORENAME #endif diff --git a/getarch.c b/getarch.c index 00e544bc7..4af986fb3 100644 --- a/getarch.c +++ b/getarch.c @@ -94,14 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(__x86_64__) || defined(_M_X64) -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) -#else -#ifndef NO_AVX512 -#define NO_AVX512 -#endif -#endif -#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -1240,7 +1232,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa53" #define CORENAME "CORTEXA53" -#else #endif #ifdef FORCE_CORTEXA57 @@ -1256,7 +1247,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa57" #define CORENAME "CORTEXA57" -#else #endif #ifdef FORCE_CORTEXA72 @@ -1272,7 +1262,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa72" #define CORENAME "CORTEXA72" -#else #endif #ifdef FORCE_CORTEXA73 @@ -1288,7 +1277,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa73" #define CORENAME "CORTEXA73" -#else +#endif + +#ifdef FORCE_CORTEXX1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXX1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXX1 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexx1" +#define CORENAME "CORTEXX1" +#endif + +#ifdef FORCE_CORTEXX2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXX2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXX2 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9" +#define LIBNAME "cortexx2" +#define CORENAME "CORTEXX2" +#endif + +#ifdef FORCE_CORTEXA510 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA510" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA510 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9" +#define LIBNAME "cortexa510" +#define CORENAME "CORTEXA510" +#endif + +#ifdef FORCE_CORTEXA710 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA710" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA710 " \ + "-DL1_DATA_SIZE=32768 
-DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9" +#define LIBNAME "cortexa710" +#define CORENAME "CORTEXA710" #endif #ifdef FORCE_NEOVERSEN1 @@ -1305,7 +1349,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-march=armv8.2-a -mtune=neoverse-n1" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" -#else #endif #ifdef FORCE_NEOVERSEV1 @@ -1322,7 +1365,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-march=armv8.4-a -mtune=neoverse-v1" #define LIBNAME "neoversev1" #define CORENAME "NEOVERSEV1" -#else #endif @@ -1340,7 +1382,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-march=armv8.5-a -mtune=neoverse-n2" #define LIBNAME "neoversen2" #define CORENAME "NEOVERSEN2" -#else #endif #ifdef FORCE_CORTEXA55 @@ -1356,7 +1397,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa55" #define CORENAME "CORTEXA55" -#else #endif #ifdef FORCE_FALKOR @@ -1372,7 +1412,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "falkor" #define CORENAME "FALKOR" -#else #endif #ifdef FORCE_THUNDERX @@ -1387,7 +1426,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx" #define CORENAME "THUNDERX" -#else #endif #ifdef FORCE_THUNDERX2T99 @@ -1405,7 +1443,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx2t99" #define CORENAME "THUNDERX2T99" -#else #endif #ifdef FORCE_TSV110 @@ -1421,7 +1458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "tsv110" #define CORENAME "TSV110" -#else #endif #ifdef FORCE_EMAG8180 @@ -1456,7 +1492,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx3t110" #define CORENAME "THUNDERX3T110" -#else #endif #ifdef FORCE_VORTEX @@ -1488,7 +1523,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" #define LIBNAME "a64fx" #define CORENAME "A64FX" -#else +#endif + +#ifdef FORCE_FT2000 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "FT2000" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DFT2000 " \ + "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=33554432 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "ft2000" +#define CORENAME "FT2000" #endif #ifdef FORCE_ZARCH_GENERIC diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 8aa6728d5..98c803e71 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -678,7 +678,7 @@ endif () set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) endif () if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) - set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) endif () GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" 
false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") diff --git a/kernel/arm64/KERNEL.CORTEXA510 b/kernel/arm64/KERNEL.CORTEXA510 new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA510 @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = 
zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c 
+DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + 
+ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.CORTEXA710 b/kernel/arm64/KERNEL.CORTEXA710 new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA710 @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S 
+ +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c 
+DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + 
+ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.CORTEXX1 b/kernel/arm64/KERNEL.CORTEXX1 new file mode 100644 index 000000000..a077ab4f3 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXX1 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 diff --git a/kernel/arm64/KERNEL.CORTEXX2 b/kernel/arm64/KERNEL.CORTEXX2 new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXX2 @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S 
+CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c 
+SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = 
zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.FT2000 b/kernel/arm64/KERNEL.FT2000 new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.FT2000 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index fe796be64..9f5d34d9b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1239,7 +1239,6 @@ static void init_parameter(void) { #ifdef BUILD_BFLOAT16 TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; - TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) @@ -1824,6 +1823,13 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p); #endif +#if BUILD_BFLOAT16==1 + TABLE_NAME.sbgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.sbgemm_p * TABLE_NAME.sbgemm_q * 4 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.sbgemm_q * 4) - 15) & ~15); +#endif + #if BUILD_SINGLE==1 TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index d9b380fff..a98772b94 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -47,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) #define MASK_STORE_512(M, N) \ result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ - asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "Yk"(mask)); \ _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) #endif @@ -265,7 +266,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp int mm = M - i; if (!mm) return 0; if (mm > 4 || K < 16) { - register __mmask8 mask asm("k1") = (1UL << mm) - 1; + register __mmask8 mask = (1UL << mm) - 1; for (j = 0; j < n6; j += 6) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -588,3 +589,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include "../generic/gemm_small_matrix_kernel_nn.c" +#endif + diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c index e757197ba..9e6eb1c4d 100644 --- a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) #define MASK_STORE_512(M, N) \ result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ - asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "Yk"(mask)); \ _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) #define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ @@ -303,7 +303,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } int mm = M - i; if (mm >= 6) { - register __mmask16 mask asm("k1") = (1UL << mm) - 1; + register __mmask16 mask = (1UL << mm) - 1; for (j = 0; j < n8; j += 8) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 18c797283..37d1ca497 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -320,3 +321,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include "../generic/gemm_small_matrix_kernel_tn.c" +#endif + diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 7af51b6d8..b94aa3c84 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -114,10 +114,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) #define _MASK_STORE_C_2nx16(addr, val0, val1) \ - asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ - asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \ - asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \ - asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask)) + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "Yk"(mmask)) #define _REORDER_C_2X(result_0, result_1) { \ __m512 tmp0, tmp1; \ @@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); #define _MASK_STORE_C_16(addr, val0) \ - asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ - asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); #define N_STORE_4X(A, Bx, By) { \ _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index b8ed9838e..bd5cbb744 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -13,6 +13,8 @@ #define ONE 1.e0f #define ZERO 0.e0f +#define SHUFFLE_MAGIC_NO (const int) 0x39 + #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT #undef SBGEMM_BLOCK_KERNEL_NN_32x8xK @@ -356,7 +358,6 @@ void sbgemm_block_kernel_nn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -465,7 +466,6 @@ void sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -1192,7 +1192,6 @@ void sbgemm_block_kernel_tn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -1291,7 +1290,6 @@ void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c index 
95ed82d7c..7ed03d70d 100644 --- a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, }; - u_int64_t permute_table2[] = { + uint64_t permute_table2[] = { 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, }; diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 9bc7a7c58..2366fe3aa 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -47,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "Yk"(mask)); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #endif @@ -266,7 +267,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp int mm = M - i; if (!mm) return 0; if (mm > 8 || K < 32) { - register __mmask16 mask asm("k1") = (1UL << mm) - 1; + register __mmask16 mask = (1UL << mm) - 1; for (j = 0; j < n6; j += 6) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -610,3 +611,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include "../generic/gemm_small_matrix_kernel_nn.c" +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index a7d87f8c4..bb00228de 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "Yk"(mask)); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ @@ -303,7 +303,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } int mm = M - i; if (mm >= 12) { - register __mmask16 mask asm("k1") = (1UL << mm) - 1; + register __mmask16 mask = (1UL << mm) - 1; for (j = 0; j < n8; j += 8) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c index 5a9a4ea32..308f5e35e 100644 --- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -314,3 +315,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include "../generic/gemm_small_matrix_kernel_tn.c" +#endif + diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index bfe0cf7ee..fa61ac939 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -452,11 +452,6 @@ MOVDDUP(4 * SIZE, A1, a1) - movsd 0 * SIZE(YY), yy1 - movhpd 1 * SIZE(YY), yy1 - movsd 2 * SIZE(YY), yy2 - movhpd 3 * SIZE(YY), yy2 - movapd 8 * SIZE(XX), xtemp1 movapd 10 * SIZE(XX), xtemp2 movapd 12 * SIZE(XX), xtemp3 @@ -475,6 +470,12 @@ MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) ALIGN_3 +.L12_prep: + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + .L12: movapd xtemp1, xt1 mulpd a1, xt1 @@ -608,8 +609,6 @@ movlpd yy2, 6 * SIZE(YY) movhpd yy2, 7 * SIZE(YY) - movsd 10 * SIZE(YY), yy2 - movhpd 11 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 18 * SIZE(XX), xtemp2 @@ -621,8 +620,6 @@ movlpd yy1, 4 * SIZE(YY) movhpd yy1, 5 * SIZE(YY) - movsd 8 * SIZE(YY), yy1 - movhpd 9 * SIZE(YY), yy1 subq $-16 * SIZE, XX addq $ 8 * SIZE, YY @@ -630,7 +627,8 @@ addq $ 8 * SIZE, A2 decq I - jg .L12 + jg .L12_prep + jmp .L15 ALIGN_3 .L14: @@ -641,7 +639,6 @@ jle .L16 MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) - jmp .L15_pastcheck .L15: movq M, I @@ -650,6 +647,11 @@ testq $2, I jle .L16 + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + .L15_pastcheck: movapd xtemp1, xt1 mulpd a1, xt1 @@ -705,8 +707,6 @@ movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) - movsd 6 * SIZE(YY), yy2 - movhpd 7 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 @@ -717,8 +717,6 @@ movlpd yy1, 0 * 
SIZE(YY) movhpd yy1, 1 * SIZE(YY) - movsd 4 * SIZE(YY), yy1 - movhpd 5 * SIZE(YY), yy1 addq $4 * SIZE, YY addq $4 * SIZE, A1 @@ -731,6 +729,9 @@ MOVDDUP(1 * SIZE, A1, a2) + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 diff --git a/lapack-netlib/BLAS/CMakeLists.txt b/lapack-netlib/BLAS/CMakeLists.txt index ee5676fc6..45cec39c2 100644 --- a/lapack-netlib/BLAS/CMakeLists.txt +++ b/lapack-netlib/BLAS/CMakeLists.txt @@ -2,9 +2,9 @@ add_subdirectory(SRC) if(BUILD_TESTING) add_subdirectory(TESTING) endif() -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/blas.pc @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${BLASLIB}.pc @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/blas.pc + ${CMAKE_CURRENT_BINARY_DIR}/${BLASLIB}.pc DESTINATION ${PKG_CONFIG_DIR} COMPONENT Development ) diff --git a/lapack-netlib/BLAS/SRC/CMakeLists.txt b/lapack-netlib/BLAS/SRC/CMakeLists.txt index 41c480432..0078dca40 100644 --- a/lapack-netlib/BLAS/SRC/CMakeLists.txt +++ b/lapack-netlib/BLAS/SRC/CMakeLists.txt @@ -97,10 +97,10 @@ if(BUILD_COMPLEX16) endif() list(REMOVE_DUPLICATES SOURCES) -add_library(blas ${SOURCES}) +add_library(${BLASLIB} ${SOURCES}) set_target_properties( - blas PROPERTIES + ${BLASLIB} PROPERTIES VERSION ${LAPACK_VERSION} SOVERSION ${LAPACK_MAJOR_VERSION} ) -lapack_install_library(blas) +lapack_install_library(${BLASLIB}) diff --git a/lapack-netlib/BLAS/TESTING/CMakeLists.txt b/lapack-netlib/BLAS/TESTING/CMakeLists.txt index 9b130db0f..ae82cf937 100644 --- a/lapack-netlib/BLAS/TESTING/CMakeLists.txt +++ b/lapack-netlib/BLAS/TESTING/CMakeLists.txt @@ -2,7 +2,7 @@ macro(add_blas_test name src) get_filename_component(baseNAME ${src} NAME_WE) set(TEST_INPUT "${CMAKE_CURRENT_SOURCE_DIR}/${baseNAME}.in") add_executable(${name} ${src}) - target_link_libraries(${name} blas) + target_link_libraries(${name} ${BLASLIB}) if(EXISTS "${TEST_INPUT}") 
add_test(NAME BLAS-${name} COMMAND "${CMAKE_COMMAND}" -DTEST=$ diff --git a/lapack-netlib/BLAS/blas.pc.in b/lapack-netlib/BLAS/blas.pc.in index 37809773b..31e11e638 100644 --- a/lapack-netlib/BLAS/blas.pc.in +++ b/lapack-netlib/BLAS/blas.pc.in @@ -5,4 +5,4 @@ Name: BLAS Description: FORTRAN reference implementation of BLAS Basic Linear Algebra Subprograms Version: @LAPACK_VERSION@ URL: http://www.netlib.org/blas/ -Libs: -L${libdir} -lblas +Libs: -L${libdir} -l@BLASLIB@ diff --git a/lapack-netlib/CBLAS/CMakeLists.txt b/lapack-netlib/CBLAS/CMakeLists.txt index 04c5ab795..da46027ac 100644 --- a/lapack-netlib/CBLAS/CMakeLists.txt +++ b/lapack-netlib/CBLAS/CMakeLists.txt @@ -1,7 +1,7 @@ message(STATUS "CBLAS enable") enable_language(C) -set(LAPACK_INSTALL_EXPORT_NAME cblas-targets) +set(LAPACK_INSTALL_EXPORT_NAME ${CBLASLIB}-targets) # Create a header file cblas.h for the routines called in my C programs include(FortranCInterface) @@ -42,15 +42,15 @@ if(BUILD_TESTING) endif() if(NOT BLAS_FOUND) - set(ALL_TARGETS ${ALL_TARGETS} blas) + set(ALL_TARGETS ${ALL_TARGETS} ${BLASLIB}) endif() # Export cblas targets from the # install tree, if any. set(_cblas_config_install_guard_target "") if(ALL_TARGETS) - install(EXPORT cblas-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION} + install(EXPORT ${CBLASLIB}-targets + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION} COMPONENT Development ) # Choose one of the cblas targets to use as a guard for @@ -61,7 +61,7 @@ endif() # Export cblas targets from the build tree, if any. set(_cblas_config_build_guard_target "") if(ALL_TARGETS) - export(TARGETS ${ALL_TARGETS} FILE cblas-targets.cmake) + export(TARGETS ${ALL_TARGETS} FILE ${CBLASLIB}-targets.cmake) # Choose one of the cblas targets to use as a guard # for cblas-config.cmake to load targets from the build tree. 
@@ -69,26 +69,26 @@ if(ALL_TARGETS) endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-version.cmake.in - ${LAPACK_BINARY_DIR}/cblas-config-version.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${CBLASLIB}-config-version.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-build.cmake.in - ${LAPACK_BINARY_DIR}/cblas-config.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${CBLASLIB}-config.cmake @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cblas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/cblas.pc @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cblas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${CBLASLIB}.pc @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/cblas.pc + ${CMAKE_CURRENT_BINARY_DIR}/${CBLASLIB}.pc DESTINATION ${PKG_CONFIG_DIR} ) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-install.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cblas-config.cmake @ONLY) + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${CBLASLIB}-config.cmake @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cblas-config.cmake - ${LAPACK_BINARY_DIR}/cblas-config-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION} + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${CBLASLIB}-config.cmake + ${LAPACK_BINARY_DIR}/${CBLASLIB}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION} ) -#install(EXPORT cblas-targets -# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION} +#install(EXPORT ${CBLASLIB}-targets +# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION} # COMPONENT Development # ) diff --git a/lapack-netlib/CBLAS/cblas.pc.in b/lapack-netlib/CBLAS/cblas.pc.in index 7c95ebbb4..882642e6c 100644 --- a/lapack-netlib/CBLAS/cblas.pc.in +++ b/lapack-netlib/CBLAS/cblas.pc.in @@ -5,6 +5,6 @@ Name: CBLAS Description: C Standard Interface to BLAS Basic Linear Algebra Subprograms Version: @LAPACK_VERSION@ URL: http://www.netlib.org/blas/#_cblas -Libs: -L${libdir} -lcblas 
+Libs: -L${libdir} -l@CBLASLIB@ Cflags: -I${includedir} -Requires.private: blas +Requires.private: @BLASLIB@ diff --git a/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in b/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in index 3747f041c..dc21c2d0f 100644 --- a/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in +++ b/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in @@ -4,11 +4,11 @@ find_package(LAPACK NO_MODULE) # Load lapack targets from the build tree, including lapacke targets. if(NOT TARGET lapacke) - include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") + include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake") endif() # Report cblas header search locations from build tree. set(CBLAS_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include") # Report cblas libraries. -set(CBLAS_LIBRARIES cblas) +set(CBLAS_LIBRARIES @CBLASLIB@) diff --git a/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in b/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in index 215e28a57..44046a283 100644 --- a/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in +++ b/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in @@ -5,19 +5,19 @@ get_filename_component(_CBLAS_PREFIX "${_CBLAS_PREFIX}" PATH) get_filename_component(_CBLAS_PREFIX "${_CBLAS_PREFIX}" PATH) # Load the LAPACK package with which we were built. -set(LAPACK_DIR "${_CBLAS_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/lapack-@LAPACK_VERSION@") +set(LAPACK_DIR "${_CBLAS_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/@LAPACKLIB@-@LAPACK_VERSION@") find_package(LAPACK NO_MODULE) # Load lapacke targets from the install tree. -if(NOT TARGET cblas) - include(${_CBLAS_SELF_DIR}/cblas-targets.cmake) +if(NOT TARGET @CBLASLIB@) + include(${_CBLAS_SELF_DIR}/@CBLASLIB@-targets.cmake) endif() # Report lapacke header search locations. set(CBLAS_INCLUDE_DIRS ${_CBLAS_PREFIX}/include) # Report lapacke libraries. 
-set(CBLAS_LIBRARIES cblas) +set(CBLAS_LIBRARIES @CBLASLIB@) unset(_CBLAS_PREFIX) unset(_CBLAS_SELF_DIR) diff --git a/lapack-netlib/CBLAS/examples/CMakeLists.txt b/lapack-netlib/CBLAS/examples/CMakeLists.txt index 0241fd164..74f7d8bb8 100644 --- a/lapack-netlib/CBLAS/examples/CMakeLists.txt +++ b/lapack-netlib/CBLAS/examples/CMakeLists.txt @@ -1,8 +1,8 @@ add_executable(xexample1_CBLAS cblas_example1.c) add_executable(xexample2_CBLAS cblas_example2.c) -target_link_libraries(xexample1_CBLAS cblas) -target_link_libraries(xexample2_CBLAS cblas ${BLAS_LIBRARIES}) +target_link_libraries(xexample1_CBLAS ${CBLASLIB}) +target_link_libraries(xexample2_CBLAS ${CBLASLIB} ${BLAS_LIBRARIES}) add_test(example1_CBLAS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample1_CBLAS) add_test(example2_CBLAS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample2_CBLAS) diff --git a/lapack-netlib/CBLAS/examples/cblas_example1.c b/lapack-netlib/CBLAS/examples/cblas_example1.c index 3d5ed330c..d89aeadb0 100644 --- a/lapack-netlib/CBLAS/examples/cblas_example1.c +++ b/lapack-netlib/CBLAS/examples/cblas_example1.c @@ -11,7 +11,7 @@ int main ( ) double *a, *x, *y; double alpha, beta; - int m, n, lda, incx, incy, i; + CBLAS_INDEX m, n, lda, incx, incy, i; Layout = CblasColMajor; transa = CblasNoTrans; diff --git a/lapack-netlib/CBLAS/examples/cblas_example2.c b/lapack-netlib/CBLAS/examples/cblas_example2.c index d2c28d53f..e82ae518c 100644 --- a/lapack-netlib/CBLAS/examples/cblas_example2.c +++ b/lapack-netlib/CBLAS/examples/cblas_example2.c @@ -9,7 +9,7 @@ int main (int argc, char **argv ) { - int rout=-1,info=0,m,n,k,lda,ldb,ldc; + CBLAS_INDEX rout=-1,info=0,m,n,k,lda,ldb,ldc; double A[2] = {0.0,0.0}, B[2] = {0.0,0.0}, C[2] = {0.0,0.0}, diff --git a/lapack-netlib/CBLAS/include/cblas.h b/lapack-netlib/CBLAS/include/cblas.h index 9e937964e..7593064f1 100644 --- a/lapack-netlib/CBLAS/include/cblas.h +++ b/lapack-netlib/CBLAS/include/cblas.h @@ -1,6 +1,7 @@ #ifndef CBLAS_H #define CBLAS_H #include +#include #ifdef 
__cplusplus @@ -11,9 +12,9 @@ extern "C" { /* Assume C declarations for C++ */ * Enumerated and derived types */ #ifdef WeirdNEC - #define CBLAS_INDEX long + #define CBLAS_INDEX int64_t #else - #define CBLAS_INDEX int + #define CBLAS_INDEX int32_t #endif typedef enum {CblasRowMajor=101, CblasColMajor=102} CBLAS_LAYOUT; diff --git a/lapack-netlib/CBLAS/include/cblas_f77.h b/lapack-netlib/CBLAS/include/cblas_f77.h index 36d4a7118..bb3f3a45d 100644 --- a/lapack-netlib/CBLAS/include/cblas_f77.h +++ b/lapack-netlib/CBLAS/include/cblas_f77.h @@ -9,6 +9,8 @@ #ifndef CBLAS_F77_H #define CBLAS_F77_H +#include + #ifdef CRAY #include #define F77_CHAR _fcd @@ -17,8 +19,12 @@ #define F77_STRLEN(a) (_fcdlen) #endif +#ifndef F77_INT #ifdef WeirdNEC - #define F77_INT long + #define F77_INT int64_t +#else + #define F77_INT int32_t +#endif #endif #ifdef F77_CHAR diff --git a/lapack-netlib/CBLAS/src/CMakeLists.txt b/lapack-netlib/CBLAS/src/CMakeLists.txt index 90e19f818..1313e798b 100644 --- a/lapack-netlib/CBLAS/src/CMakeLists.txt +++ b/lapack-netlib/CBLAS/src/CMakeLists.txt @@ -113,16 +113,16 @@ if(BUILD_COMPLEX16) endif() list(REMOVE_DUPLICATES SOURCES) -add_library(cblas ${SOURCES}) +add_library(${CBLASLIB} ${SOURCES}) set_target_properties( - cblas PROPERTIES + ${CBLASLIB} PROPERTIES LINKER_LANGUAGE C VERSION ${LAPACK_VERSION} SOVERSION ${LAPACK_MAJOR_VERSION} ) -target_include_directories(cblas PUBLIC +target_include_directories(${CBLASLIB} PUBLIC $ $ ) -target_link_libraries(cblas PRIVATE ${BLAS_LIBRARIES}) -lapack_install_library(cblas) +target_link_libraries(${CBLASLIB} PRIVATE ${BLAS_LIBRARIES}) +lapack_install_library(${CBLASLIB}) diff --git a/lapack-netlib/CBLAS/testing/CMakeLists.txt b/lapack-netlib/CBLAS/testing/CMakeLists.txt index 2459695b8..34e92a423 100644 --- a/lapack-netlib/CBLAS/testing/CMakeLists.txt +++ b/lapack-netlib/CBLAS/testing/CMakeLists.txt @@ -52,9 +52,9 @@ if(BUILD_SINGLE) add_executable(xscblat2 c_sblat2.f ${STESTL2O} 
${LAPACK_BINARY_DIR}/include/cblas_test.h) add_executable(xscblat3 c_sblat3.f ${STESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) - target_link_libraries(xscblat1 cblas) - target_link_libraries(xscblat2 cblas) - target_link_libraries(xscblat3 cblas) + target_link_libraries(xscblat1 ${CBLASLIB}) + target_link_libraries(xscblat2 ${CBLASLIB}) + target_link_libraries(xscblat3 ${CBLASLIB}) add_cblas_test(stest1.out "" xscblat1) add_cblas_test(stest2.out sin2 xscblat2) @@ -66,9 +66,9 @@ if(BUILD_DOUBLE) add_executable(xdcblat2 c_dblat2.f ${DTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) add_executable(xdcblat3 c_dblat3.f ${DTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) - target_link_libraries(xdcblat1 cblas) - target_link_libraries(xdcblat2 cblas) - target_link_libraries(xdcblat3 cblas) + target_link_libraries(xdcblat1 ${CBLASLIB}) + target_link_libraries(xdcblat2 ${CBLASLIB}) + target_link_libraries(xdcblat3 ${CBLASLIB}) add_cblas_test(dtest1.out "" xdcblat1) add_cblas_test(dtest2.out din2 xdcblat2) @@ -80,9 +80,9 @@ if(BUILD_COMPLEX) add_executable(xccblat2 c_cblat2.f ${CTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) add_executable(xccblat3 c_cblat3.f ${CTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) - target_link_libraries(xccblat1 cblas ${BLAS_LIBRARIES}) - target_link_libraries(xccblat2 cblas) - target_link_libraries(xccblat3 cblas) + target_link_libraries(xccblat1 ${CBLASLIB} ${BLAS_LIBRARIES}) + target_link_libraries(xccblat2 ${CBLASLIB}) + target_link_libraries(xccblat3 ${CBLASLIB}) add_cblas_test(ctest1.out "" xccblat1) add_cblas_test(ctest2.out cin2 xccblat2) @@ -94,9 +94,9 @@ if(BUILD_COMPLEX16) add_executable(xzcblat2 c_zblat2.f ${ZTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) add_executable(xzcblat3 c_zblat3.f ${ZTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) - target_link_libraries(xzcblat1 cblas) - target_link_libraries(xzcblat2 cblas) - target_link_libraries(xzcblat3 cblas) + target_link_libraries(xzcblat1 ${CBLASLIB}) + 
target_link_libraries(xzcblat2 ${CBLASLIB}) + target_link_libraries(xzcblat3 ${CBLASLIB}) add_cblas_test(ztest1.out "" xzcblat1) add_cblas_test(ztest2.out zin2 xzcblat2) diff --git a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake index add0d1797..15a8f01d6 100644 --- a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake +++ b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake @@ -14,6 +14,19 @@ macro( CheckLAPACKCompilerFlags ) set( FPE_EXIT FALSE ) +# FORTRAN ILP default +if ( FORTRAN_ILP ) + if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) + if ( WIN32 ) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} /integer-size:64") + else () + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -integer-size 64") + endif() + else() + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fdefault-integer-8") + endif() +endif() + # GNU Fortran if( CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" ) if( "${CMAKE_Fortran_FLAGS}" MATCHES "-ffpe-trap=[izoupd]") diff --git a/lapack-netlib/CMAKE/lapack-config-build.cmake.in b/lapack-netlib/CMAKE/lapack-config-build.cmake.in index f7e041663..da44a6ae4 100644 --- a/lapack-netlib/CMAKE/lapack-config-build.cmake.in +++ b/lapack-netlib/CMAKE/lapack-config-build.cmake.in @@ -1,7 +1,7 @@ # Load lapack targets from the build tree if necessary. set(_LAPACK_TARGET "@_lapack_config_build_guard_target@") if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}") - include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") + include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake") endif() unset(_LAPACK_TARGET) diff --git a/lapack-netlib/CMAKE/lapack-config-install.cmake.in b/lapack-netlib/CMAKE/lapack-config-install.cmake.in index 3de7362ea..77609609c 100644 --- a/lapack-netlib/CMAKE/lapack-config-install.cmake.in +++ b/lapack-netlib/CMAKE/lapack-config-install.cmake.in @@ -4,7 +4,7 @@ get_filename_component(_LAPACK_SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) # Load lapack targets from the install tree if necessary. 
set(_LAPACK_TARGET "@_lapack_config_install_guard_target@") if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}") - include("${_LAPACK_SELF_DIR}/lapack-targets.cmake") + include("${_LAPACK_SELF_DIR}/@LAPACKLIB@-targets.cmake") endif() unset(_LAPACK_TARGET) diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index df43d91b1..b704e72c5 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -44,6 +44,24 @@ endif() # By default static library option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +# By default build index32 library +option(BUILD_INDEX64 "Build Index-64 API libraries" OFF) +if(BUILD_INDEX64) + set(BLASLIB "blas64") + set(CBLASLIB "cblas64") + set(LAPACKLIB "lapack64") + set(LAPACKELIB "lapacke64") + set(TMGLIB "tmglib64") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWeirdNEC -DLAPACK_ILP64 -DHAVE_LAPACK_CONFIG_H") + set(FORTRAN_ILP TRUE) +else() + set(BLASLIB "blas") + set(CBLASLIB "cblas") + set(LAPACKLIB "lapack") + set(LAPACKELIB "lapacke") + set(TMGLIB "tmglib") +endif() + include(GNUInstallDirs) # Updated OSX RPATH settings @@ -73,10 +91,10 @@ include(PreventInBuildInstalls) if(UNIX) if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) - list(APPEND CMAKE_Fortran_FLAGS "-fp-model strict") + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") endif() if(CMAKE_Fortran_COMPILER_ID STREQUAL XL) - list(APPEND CMAKE_Fortran_FLAGS "-qnosave -qstrict=none") + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") endif() # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. 
# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin @@ -112,7 +130,7 @@ endif() # -------------------------------------------------- -set(LAPACK_INSTALL_EXPORT_NAME lapack-targets) +set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKLIB}-targets) macro(lapack_install_library lib) install(TARGETS ${lib} @@ -220,7 +238,7 @@ endif() if(NOT BLAS_FOUND) message(STATUS "Using supplied NETLIB BLAS implementation") add_subdirectory(BLAS) - set(BLAS_LIBRARIES blas) + set(BLAS_LIBRARIES ${BLASLIB}) else() set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${BLAS_LINKER_FLAGS}" @@ -279,7 +297,7 @@ endif() # Neither user specified or optimized LAPACK libraries can be used if(NOT LATESTLAPACK_FOUND) message(STATUS "Using supplied NETLIB LAPACK implementation") - set(LAPACK_LIBRARIES lapack) + set(LAPACK_LIBRARIES ${LAPACKLIB}) add_subdirectory(SRC) else() set(CMAKE_EXE_LINKER_FLAGS @@ -371,23 +389,23 @@ include(CPack) # -------------------------------------------------- if(NOT BLAS_FOUND) - set(ALL_TARGETS ${ALL_TARGETS} blas) + set(ALL_TARGETS ${ALL_TARGETS} ${BLASLIB}) endif() if(NOT LATESTLAPACK_FOUND) - set(ALL_TARGETS ${ALL_TARGETS} lapack) + set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKLIB}) endif() if(BUILD_TESTING OR LAPACKE_WITH_TMG) - set(ALL_TARGETS ${ALL_TARGETS} tmglib) + set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB}) endif() # Export lapack targets, not including lapacke, from the # install tree, if any. set(_lapack_config_install_guard_target "") if(ALL_TARGETS) - install(EXPORT lapack-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION} + install(EXPORT ${LAPACKLIB}-targets + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION} COMPONENT Development ) @@ -398,18 +416,18 @@ endif() # Include cblas in targets exported from the build tree. if(CBLAS) - set(ALL_TARGETS ${ALL_TARGETS} cblas) + set(ALL_TARGETS ${ALL_TARGETS} ${CBLASLIB}) endif() # Include lapacke in targets exported from the build tree. 
if(LAPACKE) - set(ALL_TARGETS ${ALL_TARGETS} lapacke) + set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKELIB}) endif() # Export lapack and lapacke targets from the build tree, if any. set(_lapack_config_build_guard_target "") if(ALL_TARGETS) - export(TARGETS ${ALL_TARGETS} FILE lapack-targets.cmake) + export(TARGETS ${ALL_TARGETS} FILE ${LAPACKLIB}-targets.cmake) # Choose one of the lapack or lapacke targets to use as a guard # for lapack-config.cmake to load targets from the build tree. @@ -417,30 +435,30 @@ if(ALL_TARGETS) endif() configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-build.cmake.in - ${LAPACK_BINARY_DIR}/lapack-config.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${LAPACKLIB}-config.cmake @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/lapack.pc @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKLIB}.pc @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/lapack.pc + ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKLIB}.pc DESTINATION ${PKG_CONFIG_DIR} COMPONENT Development ) configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-install.cmake.in - ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake @ONLY) + ${LAPACK_BINARY_DIR}/CMakeFiles/${LAPACKLIB}-config.cmake @ONLY) include(CMakePackageConfigHelpers) write_basic_package_version_file( - ${LAPACK_BINARY_DIR}/lapack-config-version.cmake + ${LAPACK_BINARY_DIR}/${LAPACKLIB}-config-version.cmake VERSION ${LAPACK_VERSION} COMPATIBILITY SameMajorVersion ) install(FILES - ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake - ${LAPACK_BINARY_DIR}/lapack-config-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION} + ${LAPACK_BINARY_DIR}/CMakeFiles/${LAPACKLIB}-config.cmake + ${LAPACK_BINARY_DIR}/${LAPACKLIB}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION} COMPONENT Development ) diff --git a/lapack-netlib/LAPACKE/CMakeLists.txt 
b/lapack-netlib/LAPACKE/CMakeLists.txt index 0589a74ba..60d5ddbe3 100644 --- a/lapack-netlib/LAPACKE/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/CMakeLists.txt @@ -1,7 +1,7 @@ message(STATUS "LAPACKE enable") enable_language(C) -set(LAPACK_INSTALL_EXPORT_NAME lapacke-targets) +set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKELIB}-targets) # Create a header file lapacke_mangling.h for the routines called in my C programs include(FortranCInterface) @@ -72,28 +72,28 @@ if(LAPACKE_WITH_TMG) endif() list(APPEND SOURCES ${UTILS}) -add_library(lapacke ${SOURCES}) +add_library(${LAPACKELIB} ${SOURCES}) set_target_properties( - lapacke PROPERTIES + ${LAPACKELIB} PROPERTIES LINKER_LANGUAGE C VERSION ${LAPACK_VERSION} SOVERSION ${LAPACK_MAJOR_VERSION} ) -target_include_directories(lapacke PUBLIC +target_include_directories(${LAPACKELIB} PUBLIC $ $ ) if(WIN32 AND NOT UNIX) - target_compile_definitions(lapacke PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE) + target_compile_definitions(${LAPACKELIB} PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE) message(STATUS "Windows BUILD") endif() if(LAPACKE_WITH_TMG) - target_link_libraries(lapacke PRIVATE tmglib) + target_link_libraries(${LAPACKELIB} PRIVATE ${TMGLIB}) endif() -target_link_libraries(lapacke PRIVATE ${LAPACK_LIBRARIES}) +target_link_libraries(${LAPACKELIB} PRIVATE ${LAPACK_LIBRARIES}) -lapack_install_library(lapacke) +lapack_install_library(${LAPACKELIB}) install( FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} @@ -105,28 +105,28 @@ if(BUILD_TESTING) endif() -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKELIB}.pc @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc + ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKELIB}.pc DESTINATION ${PKG_CONFIG_DIR} COMPONENT Development ) 
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-version.cmake.in - ${LAPACK_BINARY_DIR}/lapacke-config-version.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${LAPACKELIB}-config-version.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-build.cmake.in - ${LAPACK_BINARY_DIR}/lapacke-config.cmake @ONLY) + ${LAPACK_BINARY_DIR}/${LAPACKELIB}-config.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-install.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake @ONLY) + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${LAPACKELIB}-config.cmake @ONLY) install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake - ${LAPACK_BINARY_DIR}/lapacke-config-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION} + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${LAPACKELIB}-config.cmake + ${LAPACK_BINARY_DIR}/${LAPACKELIB}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKELIB}-${LAPACK_VERSION} COMPONENT Development ) -install(EXPORT lapacke-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION} +install(EXPORT ${LAPACKELIB}-targets + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKELIB}-${LAPACK_VERSION} COMPONENT Development ) diff --git a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in index 0a1350172..49ce4770a 100644 --- a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in +++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in @@ -3,8 +3,8 @@ set(LAPACK_DIR "@LAPACK_BINARY_DIR@") find_package(LAPACK NO_MODULE) # Load lapack targets from the build tree, including lapacke targets. 
-if(NOT TARGET lapacke) - include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") +if(NOT TARGET @LAPACKELIB@) + include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake") endif() # Hint for project building against lapack @@ -14,4 +14,4 @@ set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID}) set(LAPACKE_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include") # Report lapacke libraries. -set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES}) +set(LAPACKE_LIBRARIES @LAPACKELIB@ ${LAPACK_LIBRARIES}) diff --git a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in index 57a5c2b2f..2e5c36fa1 100644 --- a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in +++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in @@ -5,12 +5,12 @@ get_filename_component(_LAPACKE_PREFIX "${_LAPACKE_PREFIX}" PATH) get_filename_component(_LAPACKE_PREFIX "${_LAPACKE_PREFIX}" PATH) # Load the LAPACK package with which we were built. -set(LAPACK_DIR "${_LAPACKE_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/lapack-@LAPACK_VERSION@") +set(LAPACK_DIR "${_LAPACKE_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/@LAPACKLIB@-@LAPACK_VERSION@") find_package(LAPACK NO_MODULE) # Load lapacke targets from the install tree. -if(NOT TARGET lapacke) - include(${_LAPACKE_SELF_DIR}/lapacke-targets.cmake) +if(NOT TARGET @LAPACKELIB@) + include(${_LAPACKE_SELF_DIR}/@LAPACKELIB@-targets.cmake) endif() # Hint for project building against lapack @@ -20,7 +20,7 @@ set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID}) set(LAPACKE_INCLUDE_DIRS ${_LAPACKE_PREFIX}/include) # Report lapacke libraries. 
-set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES}) +set(LAPACKE_LIBRARIES @LAPACKELIB@ ${LAPACK_LIBRARIES}) unset(_LAPACKE_PREFIX) unset(_LAPACKE_SELF_DIR) diff --git a/lapack-netlib/LAPACKE/example/CMakeLists.txt b/lapack-netlib/LAPACKE/example/CMakeLists.txt index fa75c731c..27db8ee21 100644 --- a/lapack-netlib/LAPACKE/example/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/example/CMakeLists.txt @@ -3,10 +3,10 @@ add_executable(xexample_DGESV_colmajor example_DGESV_colmajor.c lapacke_example_ add_executable(xexample_DGELS_rowmajor example_DGELS_rowmajor.c lapacke_example_aux.c lapacke_example_aux.h) add_executable(xexample_DGELS_colmajor example_DGELS_colmajor.c lapacke_example_aux.c lapacke_example_aux.h) -target_link_libraries(xexample_DGESV_rowmajor lapacke) -target_link_libraries(xexample_DGESV_colmajor lapacke) -target_link_libraries(xexample_DGELS_rowmajor lapacke) -target_link_libraries(xexample_DGELS_colmajor lapacke) +target_link_libraries(xexample_DGESV_rowmajor ${LAPACKELIB}) +target_link_libraries(xexample_DGESV_colmajor ${LAPACKELIB}) +target_link_libraries(xexample_DGELS_rowmajor ${LAPACKELIB}) +target_link_libraries(xexample_DGELS_colmajor ${LAPACKELIB}) add_test(example_DGESV_rowmajor ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample_DGESV_rowmajor) add_test(example_DGESV_colmajor ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample_DGESV_colmajor) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 8262c3488..c6542955e 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -49,12 +49,13 @@ extern "C" { #endif /* __cplusplus */ #include +#include #ifndef lapack_int #if defined(LAPACK_ILP64) -#define lapack_int long +#define lapack_int int64_t #else -#define lapack_int int +#define lapack_int int32_t #endif #endif diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index a9236d23f..ec29f24fc 
100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -67,7 +67,11 @@ extern "C" { void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ -lapack_logical LAPACKE_lsame( char ca, char cb ); +lapack_logical LAPACKE_lsame( char ca, char cb ) +#if defined __GNUC__ + __attribute__((const)) +#endif + ; /* Functions to convert column-major to row-major 2d arrays and vice versa. */ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n, diff --git a/lapack-netlib/LAPACKE/lapacke.pc.in b/lapack-netlib/LAPACKE/lapacke.pc.in index 68da73957..0097c2597 100644 --- a/lapack-netlib/LAPACKE/lapacke.pc.in +++ b/lapack-netlib/LAPACKE/lapacke.pc.in @@ -5,6 +5,6 @@ Name: LAPACKE Description: C Standard Interface to LAPACK Linear Algebra PACKage Version: @LAPACK_VERSION@ URL: http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack -Libs: -L${libdir} -llapacke +Libs: -L${libdir} -l@LAPACKELIB@ Cflags: -I${includedir} -Requires.private: lapack +Requires.private: @LAPACKLIB@ diff --git a/lapack-netlib/SRC/CMakeLists.txt b/lapack-netlib/SRC/CMakeLists.txt index f19bdd302..bb1459165 100644 --- a/lapack-netlib/SRC/CMakeLists.txt +++ b/lapack-netlib/SRC/CMakeLists.txt @@ -500,21 +500,21 @@ if(BUILD_COMPLEX16) endif() list(REMOVE_DUPLICATES SOURCES) -add_library(lapack ${SOURCES}) +add_library(${LAPACKLIB} ${SOURCES}) set_target_properties( - lapack PROPERTIES + ${LAPACKLIB} PROPERTIES VERSION ${LAPACK_VERSION} SOVERSION ${LAPACK_MAJOR_VERSION} ) if(USE_XBLAS) - target_link_libraries(lapack PRIVATE ${XBLAS_LIBRARY}) + target_link_libraries(${LAPACKLIB} PRIVATE ${XBLAS_LIBRARY}) endif() -target_link_libraries(lapack PRIVATE ${BLAS_LIBRARIES}) +target_link_libraries(${LAPACKLIB} PRIVATE ${BLAS_LIBRARIES}) if(_is_coverage_build) - target_link_libraries(lapack PRIVATE gcov) - add_coverage(lapack) + target_link_libraries(${LAPACKLIB} PRIVATE gcov) + 
add_coverage(${LAPACKLIB}) endif() -lapack_install_library(lapack) +lapack_install_library(${LAPACKLIB}) diff --git a/lapack-netlib/TESTING/MATGEN/CMakeLists.txt b/lapack-netlib/TESTING/MATGEN/CMakeLists.txt index bc986da3a..3639c0320 100644 --- a/lapack-netlib/TESTING/MATGEN/CMakeLists.txt +++ b/lapack-netlib/TESTING/MATGEN/CMakeLists.txt @@ -47,6 +47,6 @@ if(BUILD_COMPLEX16) endif() list(REMOVE_DUPLICATES SOURCES) -add_library(tmglib ${SOURCES}) -target_link_libraries(tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) -lapack_install_library(tmglib) +add_library(${TMGLIB} ${SOURCES}) +target_link_libraries(${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) +lapack_install_library(${TMGLIB}) diff --git a/param.h b/param.h index 8649e4486..09170ba23 100644 --- a/param.h +++ b/param.h @@ -3128,9 +3128,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 -#if defined(CORTEXA57) || \ +#if defined(CORTEXA57) || defined(CORTEXX1) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3147,7 +3147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. 
So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 @@ -3377,7 +3377,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) +#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ @@ -3423,8 +3423,8 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 diff --git a/relapack/config.h b/relapack/config.h index e4fab0a12..9d6919463 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 0 +#define INCLUDE_XGEMMT 1 #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c index 0252f3d92..fc3dbc11e 100644 --- a/relapack/src/lapack_wrappers.c +++ b/relapack/src/lapack_wrappers.c @@ -566,7 +566,8 @@ void LAPACK(sgemmt)( const float *B, const blasint *ldB, const float 
*beta, float *C, const blasint *ldC ) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); } #endif @@ -578,7 +579,8 @@ void LAPACK(dgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); } #endif @@ -590,7 +592,8 @@ void LAPACK(cgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); } #endif @@ -602,6 +605,7 @@ void LAPACK(zgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); } #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e4ee8b28b..5214d9cab 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -30,6 +30,10 @@ if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 "if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" "$ErrorActionPreference = \"Stop\"\n" +"If ((Get-Content $args[1] | & file - | %{$_ -match \"BOM\"}) -contains $true) {\n" +"echo 'Skipped due to wrong input encoding'\n" +"exit 0\n" +"}\n" "Get-Content $args[1] | & $args[0]\n" "If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" "echo Error\n"