Merge branch 'develop' into clapack

This commit is contained in:
Martin Kroeker 2022-03-30 18:01:38 +02:00 committed by GitHub
commit e3250e2362
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
63 changed files with 1217 additions and 221 deletions

View File

@ -23,9 +23,9 @@ if(MSVC AND NOT DEFINED NOFORTRAN)
endif()
#######
if(MSVC)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
@ -320,7 +320,9 @@ if (NOT NOFORTRAN)
if(NOT NO_CBLAS)
add_subdirectory(ctest)
endif()
if (BUILD_TESTING)
add_subdirectory(lapack-netlib/TESTING)
endif()
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
add_subdirectory(cpp_thread_test)
endif()

View File

@ -55,6 +55,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
endif
ifeq ($(CORE), FT2000)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
endif
# Use a72 tunings because Neoverse-N1 is only available
# in GCC>=9
ifeq ($(CORE), NEOVERSEN1)
@ -229,6 +236,43 @@ endif
endif
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXX1)
CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
endif
endif
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXX2)
CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve
endif
endif
endif
#ifeq (1, $(filter 1,$(ISCLANG)))
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXA510)
CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve
endif
endif
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXA710)
CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve
endif
endif
endif
endif
endif

View File

@ -71,7 +71,8 @@ endif
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
avx512=$$(perl c_check - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
getarch_2nd : getarch_2nd.c config.h dummy
ifndef TARGET_CORE

View File

@ -92,6 +92,10 @@ CORTEXA53
CORTEXA57
CORTEXA72
CORTEXA73
CORTEXA510
CORTEXA710
CORTEXX1
CORTEXX2
NEOVERSEN1
NEOVERSEV1
NEOVERSEN2
@ -103,6 +107,9 @@ THUNDERX2T99
TSV110
THUNDERX3T110
VORTEX
A64FX
ARMV8SVE
FT2000
9.System Z:
ZARCH_GENERIC

View File

@ -65,7 +65,7 @@ jobs:
- task: CMake@1
inputs:
workingDirectory: 'build' # Optional
cmakeArgs: '-G "Visual Studio 16 2019" ..'
cmakeArgs: '-G "Visual Studio 17 2022" ..'
- task: CMake@1
inputs:
cmakeArgs: '--build . --config Release'
@ -103,7 +103,7 @@ jobs:
- job: Windows_flang_clang
pool:
vmImage: 'windows-latest'
vmImage: 'windows-2022'
steps:
- script: |
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
@ -114,8 +114,8 @@ jobs:
conda install --yes --quiet ninja flang
mkdir build
cd build
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
cmake --build . --config Release
ctest

View File

@ -254,7 +254,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
print $fh "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
if ($compiler eq "PGI") {
$args = " -tp skylake -c -o $tmpf.o $tmpf";
@ -278,7 +278,7 @@ if ($data =~ /HAVE_C11/) {
$c11_atomics = 0;
} else {
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
print $tmpf "#include <stdatomic.h>\nint main(void){}\n";
print $fh "#include <stdatomic.h>\nint main(void){}\n";
$args = " -c -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
@ -316,6 +316,7 @@ if ($architecture ne $hostarch) {
}
$cross = 1 if ($os ne $hostos);
$cross = 0 if (($os eq "Android") && ($hostos eq "Linux") && ($ENV{TERMUX_APP_PID} != ""));
$openmp = "" if $ENV{USE_OPENMP} != 1;

View File

@ -161,6 +161,30 @@ if (${CORE} STREQUAL ARMV8SVE)
endif ()
endif ()
if (${CORE} STREQUAL CORTEXA510)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
if (${CORE} STREQUAL CORTEXA710)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
if (${CORE} STREQUAL CORTEXX1)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a")
endif ()
endif ()
if (${CORE} STREQUAL CORTEXX2)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
if (${CORE} STREQUAL POWER10)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)

View File

@ -67,8 +67,16 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
if (INTERFACE64)
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
if (WIN32)
set(FCOMMON_OPT "${FCOMMON_OPT} /integer-size:64")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -integer-size 64")
endif ()
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
endif ()
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
endif ()

View File

@ -2610,8 +2610,9 @@
#endif
#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)
#if !defined(DYNAMIC_ARCH) \
&& (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K))
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sbgemm_p;

View File

@ -45,6 +45,10 @@ size_t length64=sizeof(value64);
#define CPU_NEOVERSEN1 11
#define CPU_NEOVERSEV1 16
#define CPU_NEOVERSEN2 17
#define CPU_CORTEXX1 18
#define CPU_CORTEXX2 19
#define CPU_CORTEXA510 20
#define CPU_CORTEXA710 21
// Qualcomm
#define CPU_FALKOR 6
// Cavium
@ -59,6 +63,8 @@ size_t length64=sizeof(value64);
#define CPU_VORTEX 13
// Fujitsu
#define CPU_A64FX 15
// Phytium
#define CPU_FT2000 22
static char *cpuname[] = {
"UNKNOWN",
@ -73,12 +79,17 @@ static char *cpuname[] = {
"TSV110",
"EMAG8180",
"NEOVERSEN1",
"NEOVERSEV1"
"NEOVERSEN2"
"THUNDERX3T110",
"VORTEX",
"CORTEXA55",
"A64FX"
"A64FX",
"NEOVERSEV1",
"NEOVERSEN2",
"CORTEXX1",
"CORTEXX2",
"CORTEXA510",
"CORTEXA710",
"FT2000"
};
static char *cpuname_lower[] = {
@ -94,12 +105,17 @@ static char *cpuname_lower[] = {
"tsv110",
"emag8180",
"neoversen1",
"neoversev1",
"neoversen2",
"thunderx3t110",
"vortex",
"cortexa55",
"a64fx"
"a64fx",
"neoversev1",
"neoversen2",
"cortexx1",
"cortexx2",
"cortexa510",
"cortexa710",
"ft2000"
};
int get_feature(char *search)
@ -182,6 +198,14 @@ int detect(void)
return CPU_NEOVERSEN2;
else if (strstr(cpu_part, "0xd05"))
return CPU_CORTEXA55;
else if (strstr(cpu_part, "0xd46"))
return CPU_CORTEXA510;
else if (strstr(cpu_part, "0xd47"))
return CPU_CORTEXA710;
else if (strstr(cpu_part, "0xd44"))
return CPU_CORTEXX1;
else if (strstr(cpu_part, "0xd4c"))
return CPU_CORTEXX2;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@ -202,6 +226,13 @@ int detect(void)
// Fujitsu
else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
return CPU_A64FX;
// Apple
else if (strstr(cpu_implementer, "0x61") && strstr(cpu_part, "0x022"))
return CPU_VORTEX;
// Phytium
else if (strstr(cpu_implementer, "0x70") && (strstr(cpu_part, "0x660") || strstr(cpu_part, "0x661")
|| strstr(cpu_part, "0x662") || strstr(cpu_part, "0x663")))
return CPU_FT2000;
}
p = (char *) NULL ;
@ -382,7 +413,24 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 48\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_CORTEXA510:
case CPU_CORTEXA710:
case CPU_CORTEXX1:
case CPU_CORTEXX2:
printf("#define ARMV9\n");
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 4\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
@ -469,9 +517,9 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#ifdef __APPLE__
case CPU_VORTEX:
printf("#define VORTEX \n");
#ifdef __APPLE__
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
@ -480,10 +528,10 @@ void get_cpuconfig(void)
printf("#define L1_DATA_SIZE %lld \n",value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
#endif
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#endif
case CPU_A64FX:
printf("#define A64FX\n");
printf("#define L1_CODE_SIZE 65535\n");
@ -494,6 +542,16 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_FT2000:
printf("#define FT2000\n");
printf("#define L1_CODE_SIZE 32768\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 33554432\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
get_cpucount();
}

View File

@ -1707,8 +1707,18 @@ int get_cpuname(void){
if (model == 0xf && stepping < 0xe)
return CPUTYPE_NANO;
return CPUTYPE_NEHALEM;
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return CPUTYPE_ZEN;
else
return CPUTYPE_DUNNINGTON;
default:
if (family >= 0x7)
return CPUTYPE_NEHALEM;
}
default:
if (family >= 0x8)
return CPUTYPE_NEHALEM;
else
return CPUTYPE_VIAC3;
@ -1716,8 +1726,21 @@ int get_cpuname(void){
}
if (vendor == VENDOR_ZHAOXIN){
switch (family) {
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return CPUTYPE_ZEN;
else
return CPUTYPE_DUNNINGTON;
default:
return CPUTYPE_NEHALEM;
}
default:
return CPUTYPE_NEHALEM;
}
}
if (vendor == VENDOR_RISE){
switch (family) {
@ -2416,8 +2439,18 @@ int get_coretype(void){
if (model == 0xf && stepping < 0xe)
return CORE_NANO;
return CORE_NEHALEM;
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return CORE_ZEN;
else
return CORE_DUNNINGTON;
default:
if (family >= 0x7)
return CORE_NEHALEM;
}
default:
if (family >= 0x8)
return CORE_NEHALEM;
else
return CORE_VIAC3;
@ -2425,8 +2458,21 @@ int get_coretype(void){
}
if (vendor == VENDOR_ZHAOXIN) {
switch (family) {
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return CORE_ZEN;
else
return CORE_DUNNINGTON;
default:
return CORE_NEHALEM;
}
default:
return CORE_NEHALEM;
}
}
return CORE_UNKNOWN;
}

View File

@ -96,7 +96,7 @@ extern gotoblas_t gotoblas_BARCELONA;
#endif
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
elif defined(DYN_NEHALEM)
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
@ -875,15 +875,38 @@ static gotoblas_t *get_coretype(void){
if (model == 0xf && stepping < 0xe)
return &gotoblas_NANO;
return &gotoblas_NEHALEM;
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return &gotoblas_ZEN;
else
return &gotoblas_DUNNINGTON;
default:
if (family >= 0x7)
return &gotoblas_NEHALEM;
}
default:
if (family >= 0x8)
return &gotoblas_NEHALEM;
}
}
if (vendor == VENDOR_ZHAOXIN) {
switch (family) {
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return &gotoblas_ZEN;
else
return &gotoblas_DUNNINGTON;
default:
return &gotoblas_NEHALEM;
}
default:
return &gotoblas_NEHALEM;
}
}
return NULL;
}

View File

@ -60,6 +60,9 @@ static char* openblas_config_str=""
#ifdef USE_OPENMP
"USE_OPENMP "
#endif
#ifdef USE_TLS
"USE_TLS "
#endif
#ifndef DYNAMIC_ARCH
CHAR_CORENAME
#endif

View File

@ -94,14 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/sysinfo.h>
#endif
#if defined(__x86_64__) || defined(_M_X64)
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#else
#ifndef NO_AVX512
#define NO_AVX512
#endif
#endif
#endif
/* #define FORCE_P2 */
/* #define FORCE_KATMAI */
/* #define FORCE_COPPERMINE */
@ -1240,7 +1232,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa53"
#define CORENAME "CORTEXA53"
#else
#endif
#ifdef FORCE_CORTEXA57
@ -1256,7 +1247,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57"
#else
#endif
#ifdef FORCE_CORTEXA72
@ -1272,7 +1262,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa72"
#define CORENAME "CORTEXA72"
#else
#endif
#ifdef FORCE_CORTEXA73
@ -1288,7 +1277,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa73"
#define CORENAME "CORTEXA73"
#else
#endif
#ifdef FORCE_CORTEXX1
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXX1"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXX1 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexx1"
#define CORENAME "CORTEXX1"
#endif
#ifdef FORCE_CORTEXX2
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXX2"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXX2 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
#define LIBNAME "cortexx2"
#define CORENAME "CORTEXX2"
#endif
#ifdef FORCE_CORTEXA510
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA510"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA510 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
#define LIBNAME "cortexa510"
#define CORENAME "CORTEXA510"
#endif
#ifdef FORCE_CORTEXA710
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA710"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA710 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
#define LIBNAME "cortexa710"
#define CORENAME "CORTEXA710"
#endif
#ifdef FORCE_NEOVERSEN1
@ -1305,7 +1349,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-march=armv8.2-a -mtune=neoverse-n1"
#define LIBNAME "neoversen1"
#define CORENAME "NEOVERSEN1"
#else
#endif
#ifdef FORCE_NEOVERSEV1
@ -1322,7 +1365,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-march=armv8.4-a -mtune=neoverse-v1"
#define LIBNAME "neoversev1"
#define CORENAME "NEOVERSEV1"
#else
#endif
@ -1340,7 +1382,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-march=armv8.5-a -mtune=neoverse-n2"
#define LIBNAME "neoversen2"
#define CORENAME "NEOVERSEN2"
#else
#endif
#ifdef FORCE_CORTEXA55
@ -1356,7 +1397,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa55"
#define CORENAME "CORTEXA55"
#else
#endif
#ifdef FORCE_FALKOR
@ -1372,7 +1412,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "falkor"
#define CORENAME "FALKOR"
#else
#endif
#ifdef FORCE_THUNDERX
@ -1387,7 +1426,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx"
#define CORENAME "THUNDERX"
#else
#endif
#ifdef FORCE_THUNDERX2T99
@ -1405,7 +1443,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx2t99"
#define CORENAME "THUNDERX2T99"
#else
#endif
#ifdef FORCE_TSV110
@ -1421,7 +1458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "tsv110"
#define CORENAME "TSV110"
#else
#endif
#ifdef FORCE_EMAG8180
@ -1456,7 +1492,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx3t110"
#define CORENAME "THUNDERX3T110"
#else
#endif
#ifdef FORCE_VORTEX
@ -1488,7 +1523,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
#define LIBNAME "a64fx"
#define CORENAME "A64FX"
#else
#endif
#ifdef FORCE_FT2000
#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "FT2000"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DFT2000 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=33554426-DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "ft2000"
#define CORENAME "FT2000"
#endif
#ifdef FORCE_ZARCH_GENERIC

View File

@ -678,7 +678,7 @@ endif ()
set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_B0_TT)
set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")

View File

@ -0,0 +1,216 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
TRSMCOPYLN_M = trsm_lncopy_sve.c
TRSMCOPYLT_M = trsm_ltcopy_sve.c
TRSMCOPYUN_M = trsm_uncopy_sve.c
TRSMCOPYUT_M = trsm_utcopy_sve.c
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
CSYMMUCOPY_M = zsymm_ucopy_sve.c
CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
ZSYMMLCOPY_M = zsymm_lcopy_sve.c

View File

@ -0,0 +1,216 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
TRSMCOPYLN_M = trsm_lncopy_sve.c
TRSMCOPYLT_M = trsm_ltcopy_sve.c
TRSMCOPYUN_M = trsm_uncopy_sve.c
TRSMCOPYUT_M = trsm_utcopy_sve.c
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
CSYMMUCOPY_M = zsymm_ucopy_sve.c
CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
ZSYMMLCOPY_M = zsymm_lcopy_sve.c

View File

@ -0,0 +1 @@
include $(KERNELDIR)/KERNEL.CORTEXA57

View File

@ -0,0 +1,216 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
TRSMCOPYLN_M = trsm_lncopy_sve.c
TRSMCOPYLT_M = trsm_ltcopy_sve.c
TRSMCOPYUN_M = trsm_uncopy_sve.c
TRSMCOPYUT_M = trsm_utcopy_sve.c
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
CSYMMUCOPY_M = zsymm_ucopy_sve.c
CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
ZSYMMLCOPY_M = zsymm_lcopy_sve.c

View File

@ -0,0 +1,3 @@
include $(KERNELDIR)/KERNEL.CORTEXA57

View File

@ -1239,7 +1239,6 @@ static void init_parameter(void) {
#ifdef BUILD_BFLOAT16
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
@ -1824,6 +1823,13 @@ static void init_parameter(void) {
fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p);
#endif
#if BUILD_BFLOAT16==1
TABLE_NAME.sbgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.sbgemm_p * TABLE_NAME.sbgemm_q * 4 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.sbgemm_q * 4) - 15) & ~15);
#endif
#if BUILD_SINGLE==1
TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA

View File

@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#include <immintrin.h>
#include "common.h"
@ -47,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
_mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N)
#define MASK_STORE_512(M, N) \
result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \
asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "Yk"(mask)); \
_mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N)
#endif
@ -265,7 +266,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
int mm = M - i;
if (!mm) return 0;
if (mm > 4 || K < 16) {
register __mmask8 mask asm("k1") = (1UL << mm) - 1;
register __mmask8 mask = (1UL << mm) - 1;
for (j = 0; j < n6; j += 6) {
DECLARE_RESULT_512(0, 0);
DECLARE_RESULT_512(0, 1);
@ -588,3 +589,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
}
return 0;
}
#else
#include "../generic/gemm_small_matrix_kernel_nn.c"
#endif

View File

@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
_mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N)
#define MASK_STORE_512(M, N) \
result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \
asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "Yk"(mask)); \
_mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N)
#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
__m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \
@ -303,7 +303,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
}
int mm = M - i;
if (mm >= 6) {
register __mmask16 mask asm("k1") = (1UL << mm) - 1;
register __mmask16 mask = (1UL << mm) - 1;
for (j = 0; j < n8; j += 8) {
DECLARE_RESULT_512(0, 0);
DECLARE_RESULT_512(0, 1);

View File

@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#include <immintrin.h>
#include "common.h"
@ -320,3 +321,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
}
return 0;
}
#else
#include "../generic/gemm_small_matrix_kernel_tn.c"
#endif

View File

@ -114,10 +114,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc))
#define _MASK_STORE_C_2nx16(addr, val0, val1) \
asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \
asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \
asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \
asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask))
asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \
asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "Yk"(mmask)); \
asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); \
asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "Yk"(mmask))
#define _REORDER_C_2X(result_0, result_1) { \
__m512 tmp0, tmp1; \
@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
asm("vmovups %0, (%1)": : "v"(val0), "r"(addr));
#define _MASK_STORE_C_16(addr, val0) \
asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \
asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask));
asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \
asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask));
#define N_STORE_4X(A, Bx, By) { \
_REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \

View File

@ -13,6 +13,8 @@
#define ONE 1.e0f
#define ZERO 0.e0f
#define SHUFFLE_MAGIC_NO (const int) 0x39
#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef SBGEMM_BLOCK_KERNEL_NN_32x8xK
@ -356,7 +358,6 @@ void sbgemm_block_kernel_nn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa
bfloat16 * B_addr = B;
float * C_addr = C;
int SHUFFLE_MAGIC_NO = 0x39;
BLASLONG tag_k_32x = k & (~31);
#ifndef ONE_ALPHA
@ -465,7 +466,6 @@ void sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa
bfloat16 * B_addr = B;
float * C_addr = C;
int SHUFFLE_MAGIC_NO = 0x39;
BLASLONG tag_k_32x = k & (~31);
#ifndef ONE_ALPHA
@ -1192,7 +1192,6 @@ void sbgemm_block_kernel_tn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa
bfloat16 * B_addr = B;
float * C_addr = C;
int SHUFFLE_MAGIC_NO = 0x39;
BLASLONG tag_k_32x = k & (~31);
#ifndef ONE_ALPHA
@ -1291,7 +1290,6 @@ void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa
bfloat16 * B_addr = B;
float * C_addr = C;
int SHUFFLE_MAGIC_NO = 0x39;
BLASLONG tag_k_32x = k & (~31);
#ifndef ONE_ALPHA

View File

@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
};
u_int64_t permute_table2[] = {
uint64_t permute_table2[] = {
0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3,
0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7,
};

View File

@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#include <immintrin.h>
#include "common.h"
@ -47,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N)
#define MASK_STORE_512(M, N) \
result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \
asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "Yk"(mask)); \
_mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N)
#endif
@ -266,7 +267,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
int mm = M - i;
if (!mm) return 0;
if (mm > 8 || K < 32) {
register __mmask16 mask asm("k1") = (1UL << mm) - 1;
register __mmask16 mask = (1UL << mm) - 1;
for (j = 0; j < n6; j += 6) {
DECLARE_RESULT_512(0, 0);
DECLARE_RESULT_512(0, 1);
@ -610,3 +611,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
}
return 0;
}
#else
#include "../generic/gemm_small_matrix_kernel_nn.c"
#endif

View File

@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N)
#define MASK_STORE_512(M, N) \
result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \
asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "Yk"(mask)); \
_mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N)
#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
__m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \
@ -303,7 +303,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
}
int mm = M - i;
if (mm >= 12) {
register __mmask16 mask asm("k1") = (1UL << mm) - 1;
register __mmask16 mask = (1UL << mm) - 1;
for (j = 0; j < n8; j += 8) {
DECLARE_RESULT_512(0, 0);
DECLARE_RESULT_512(0, 1);

View File

@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#include <immintrin.h>
#include "common.h"
@ -314,3 +315,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
}
return 0;
}
#else
#include "../generic/gemm_small_matrix_kernel_tn.c"
#endif

View File

@ -452,11 +452,6 @@
MOVDDUP(4 * SIZE, A1, a1)
movsd 0 * SIZE(YY), yy1
movhpd 1 * SIZE(YY), yy1
movsd 2 * SIZE(YY), yy2
movhpd 3 * SIZE(YY), yy2
movapd 8 * SIZE(XX), xtemp1
movapd 10 * SIZE(XX), xtemp2
movapd 12 * SIZE(XX), xtemp3
@ -475,6 +470,12 @@
MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2)
ALIGN_3
.L12_prep:
movsd 0 * SIZE(YY), yy1
movhpd 1 * SIZE(YY), yy1
movsd 2 * SIZE(YY), yy2
movhpd 3 * SIZE(YY), yy2
.L12:
movapd xtemp1, xt1
mulpd a1, xt1
@ -608,8 +609,6 @@
movlpd yy2, 6 * SIZE(YY)
movhpd yy2, 7 * SIZE(YY)
movsd 10 * SIZE(YY), yy2
movhpd 11 * SIZE(YY), yy2
movapd xtemp2, xt1
movapd 18 * SIZE(XX), xtemp2
@ -621,8 +620,6 @@
movlpd yy1, 4 * SIZE(YY)
movhpd yy1, 5 * SIZE(YY)
movsd 8 * SIZE(YY), yy1
movhpd 9 * SIZE(YY), yy1
subq $-16 * SIZE, XX
addq $ 8 * SIZE, YY
@ -630,7 +627,8 @@
addq $ 8 * SIZE, A2
decq I
jg .L12
jg .L12_prep
jmp .L15
ALIGN_3
.L14:
@ -641,7 +639,6 @@
jle .L16
MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2)
jmp .L15_pastcheck
.L15:
movq M, I
@ -650,6 +647,11 @@
testq $2, I
jle .L16
movsd 0 * SIZE(YY), yy1
movhpd 1 * SIZE(YY), yy1
movsd 2 * SIZE(YY), yy2
movhpd 3 * SIZE(YY), yy2
.L15_pastcheck:
movapd xtemp1, xt1
mulpd a1, xt1
@ -705,8 +707,6 @@
movlpd yy2, 2 * SIZE(YY)
movhpd yy2, 3 * SIZE(YY)
movsd 6 * SIZE(YY), yy2
movhpd 7 * SIZE(YY), yy2
movapd xtemp2, xt1
movapd 10 * SIZE(XX), xtemp2
@ -717,8 +717,6 @@
movlpd yy1, 0 * SIZE(YY)
movhpd yy1, 1 * SIZE(YY)
movsd 4 * SIZE(YY), yy1
movhpd 5 * SIZE(YY), yy1
addq $4 * SIZE, YY
addq $4 * SIZE, A1
@ -731,6 +729,9 @@
MOVDDUP(1 * SIZE, A1, a2)
movsd 0 * SIZE(YY), yy1
movhpd 1 * SIZE(YY), yy1
movapd xtemp1, xt1
mulpd a1, xt1
mulpd atemp1, a1

View File

@ -2,9 +2,9 @@ add_subdirectory(SRC)
if(BUILD_TESTING)
add_subdirectory(TESTING)
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/blas.pc @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${BLASLIB}.pc @ONLY)
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/blas.pc
${CMAKE_CURRENT_BINARY_DIR}/${BLASLIB}.pc
DESTINATION ${PKG_CONFIG_DIR}
COMPONENT Development
)

View File

@ -97,10 +97,10 @@ if(BUILD_COMPLEX16)
endif()
list(REMOVE_DUPLICATES SOURCES)
add_library(blas ${SOURCES})
add_library(${BLASLIB} ${SOURCES})
set_target_properties(
blas PROPERTIES
${BLASLIB} PROPERTIES
VERSION ${LAPACK_VERSION}
SOVERSION ${LAPACK_MAJOR_VERSION}
)
lapack_install_library(blas)
lapack_install_library(${BLASLIB})

View File

@ -2,7 +2,7 @@ macro(add_blas_test name src)
get_filename_component(baseNAME ${src} NAME_WE)
set(TEST_INPUT "${CMAKE_CURRENT_SOURCE_DIR}/${baseNAME}.in")
add_executable(${name} ${src})
target_link_libraries(${name} blas)
target_link_libraries(${name} ${BLASLIB})
if(EXISTS "${TEST_INPUT}")
add_test(NAME BLAS-${name} COMMAND "${CMAKE_COMMAND}"
-DTEST=$<TARGET_FILE:${name}>

View File

@ -5,4 +5,4 @@ Name: BLAS
Description: FORTRAN reference implementation of BLAS Basic Linear Algebra Subprograms
Version: @LAPACK_VERSION@
URL: http://www.netlib.org/blas/
Libs: -L${libdir} -lblas
Libs: -L${libdir} -l@BLASLIB@

View File

@ -1,7 +1,7 @@
message(STATUS "CBLAS enable")
enable_language(C)
set(LAPACK_INSTALL_EXPORT_NAME cblas-targets)
set(LAPACK_INSTALL_EXPORT_NAME ${CBLASLIB}-targets)
# Create a header file cblas.h for the routines called in my C programs
include(FortranCInterface)
@ -42,15 +42,15 @@ if(BUILD_TESTING)
endif()
if(NOT BLAS_FOUND)
set(ALL_TARGETS ${ALL_TARGETS} blas)
set(ALL_TARGETS ${ALL_TARGETS} ${BLASLIB})
endif()
# Export cblas targets from the
# install tree, if any.
set(_cblas_config_install_guard_target "")
if(ALL_TARGETS)
install(EXPORT cblas-targets
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}
install(EXPORT ${CBLASLIB}-targets
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION}
COMPONENT Development
)
# Choose one of the cblas targets to use as a guard for
@ -61,7 +61,7 @@ endif()
# Export cblas targets from the build tree, if any.
set(_cblas_config_build_guard_target "")
if(ALL_TARGETS)
export(TARGETS ${ALL_TARGETS} FILE cblas-targets.cmake)
export(TARGETS ${ALL_TARGETS} FILE ${CBLASLIB}-targets.cmake)
# Choose one of the cblas targets to use as a guard
# for cblas-config.cmake to load targets from the build tree.
@ -69,26 +69,26 @@ if(ALL_TARGETS)
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-version.cmake.in
${LAPACK_BINARY_DIR}/cblas-config-version.cmake @ONLY)
${LAPACK_BINARY_DIR}/${CBLASLIB}-config-version.cmake @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-build.cmake.in
${LAPACK_BINARY_DIR}/cblas-config.cmake @ONLY)
${LAPACK_BINARY_DIR}/${CBLASLIB}-config.cmake @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cblas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/cblas.pc @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cblas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${CBLASLIB}.pc @ONLY)
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/cblas.pc
${CMAKE_CURRENT_BINARY_DIR}/${CBLASLIB}.pc
DESTINATION ${PKG_CONFIG_DIR}
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-install.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cblas-config.cmake @ONLY)
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${CBLASLIB}-config.cmake @ONLY)
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cblas-config.cmake
${LAPACK_BINARY_DIR}/cblas-config-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${CBLASLIB}-config.cmake
${LAPACK_BINARY_DIR}/${CBLASLIB}-config-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION}
)
#install(EXPORT cblas-targets
# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}
#install(EXPORT ${CBLASLIB}-targets
# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CBLASLIB}-${LAPACK_VERSION}
# COMPONENT Development
# )

View File

@ -5,6 +5,6 @@ Name: CBLAS
Description: C Standard Interface to BLAS Basic Linear Algebra Subprograms
Version: @LAPACK_VERSION@
URL: http://www.netlib.org/blas/#_cblas
Libs: -L${libdir} -lcblas
Libs: -L${libdir} -l@CBLASLIB@
Cflags: -I${includedir}
Requires.private: blas
Requires.private: @BLASLIB@

View File

@ -4,11 +4,11 @@ find_package(LAPACK NO_MODULE)
# Load lapack targets from the build tree, including lapacke targets.
if(NOT TARGET lapacke)
include("@LAPACK_BINARY_DIR@/lapack-targets.cmake")
include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake")
endif()
# Report cblas header search locations from build tree.
set(CBLAS_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include")
# Report cblas libraries.
set(CBLAS_LIBRARIES cblas)
set(CBLAS_LIBRARIES @CBLASLIB@)

View File

@ -5,19 +5,19 @@ get_filename_component(_CBLAS_PREFIX "${_CBLAS_PREFIX}" PATH)
get_filename_component(_CBLAS_PREFIX "${_CBLAS_PREFIX}" PATH)
# Load the LAPACK package with which we were built.
set(LAPACK_DIR "${_CBLAS_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/lapack-@LAPACK_VERSION@")
set(LAPACK_DIR "${_CBLAS_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/@LAPACKLIB@-@LAPACK_VERSION@")
find_package(LAPACK NO_MODULE)
# Load lapacke targets from the install tree.
if(NOT TARGET cblas)
include(${_CBLAS_SELF_DIR}/cblas-targets.cmake)
if(NOT TARGET @CBLASLIB@)
include(${_CBLAS_SELF_DIR}/@CBLASLIB@-targets.cmake)
endif()
# Report lapacke header search locations.
set(CBLAS_INCLUDE_DIRS ${_CBLAS_PREFIX}/include)
# Report lapacke libraries.
set(CBLAS_LIBRARIES cblas)
set(CBLAS_LIBRARIES @CBLASLIB@)
unset(_CBLAS_PREFIX)
unset(_CBLAS_SELF_DIR)

View File

@ -1,8 +1,8 @@
add_executable(xexample1_CBLAS cblas_example1.c)
add_executable(xexample2_CBLAS cblas_example2.c)
target_link_libraries(xexample1_CBLAS cblas)
target_link_libraries(xexample2_CBLAS cblas ${BLAS_LIBRARIES})
target_link_libraries(xexample1_CBLAS ${CBLASLIB})
target_link_libraries(xexample2_CBLAS ${CBLASLIB} ${BLAS_LIBRARIES})
add_test(example1_CBLAS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample1_CBLAS)
add_test(example2_CBLAS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample2_CBLAS)

View File

@ -11,7 +11,7 @@ int main ( )
double *a, *x, *y;
double alpha, beta;
int m, n, lda, incx, incy, i;
CBLAS_INDEX m, n, lda, incx, incy, i;
Layout = CblasColMajor;
transa = CblasNoTrans;

View File

@ -9,7 +9,7 @@
int main (int argc, char **argv )
{
int rout=-1,info=0,m,n,k,lda,ldb,ldc;
CBLAS_INDEX rout=-1,info=0,m,n,k,lda,ldb,ldc;
double A[2] = {0.0,0.0},
B[2] = {0.0,0.0},
C[2] = {0.0,0.0},

View File

@ -1,6 +1,7 @@
#ifndef CBLAS_H
#define CBLAS_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
@ -11,9 +12,9 @@ extern "C" { /* Assume C declarations for C++ */
* Enumerated and derived types
*/
#ifdef WeirdNEC
#define CBLAS_INDEX long
#define CBLAS_INDEX int64_t
#else
#define CBLAS_INDEX int
#define CBLAS_INDEX int32_t
#endif
typedef enum {CblasRowMajor=101, CblasColMajor=102} CBLAS_LAYOUT;

View File

@ -9,6 +9,8 @@
#ifndef CBLAS_F77_H
#define CBLAS_F77_H
#include <stdint.h>
#ifdef CRAY
#include <fortran.h>
#define F77_CHAR _fcd
@ -17,8 +19,12 @@
#define F77_STRLEN(a) (_fcdlen)
#endif
#ifndef F77_INT
#ifdef WeirdNEC
#define F77_INT long
#define F77_INT int64_t
#else
#define F77_INT int32_t
#endif
#endif
#ifdef F77_CHAR

View File

@ -113,16 +113,16 @@ if(BUILD_COMPLEX16)
endif()
list(REMOVE_DUPLICATES SOURCES)
add_library(cblas ${SOURCES})
add_library(${CBLASLIB} ${SOURCES})
set_target_properties(
cblas PROPERTIES
${CBLASLIB} PROPERTIES
LINKER_LANGUAGE C
VERSION ${LAPACK_VERSION}
SOVERSION ${LAPACK_MAJOR_VERSION}
)
target_include_directories(cblas PUBLIC
target_include_directories(${CBLASLIB} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
$<INSTALL_INTERFACE:include>
)
target_link_libraries(cblas PRIVATE ${BLAS_LIBRARIES})
lapack_install_library(cblas)
target_link_libraries(${CBLASLIB} PRIVATE ${BLAS_LIBRARIES})
lapack_install_library(${CBLASLIB})

View File

@ -52,9 +52,9 @@ if(BUILD_SINGLE)
add_executable(xscblat2 c_sblat2.f ${STESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h)
add_executable(xscblat3 c_sblat3.f ${STESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h)
target_link_libraries(xscblat1 cblas)
target_link_libraries(xscblat2 cblas)
target_link_libraries(xscblat3 cblas)
target_link_libraries(xscblat1 ${CBLASLIB})
target_link_libraries(xscblat2 ${CBLASLIB})
target_link_libraries(xscblat3 ${CBLASLIB})
add_cblas_test(stest1.out "" xscblat1)
add_cblas_test(stest2.out sin2 xscblat2)
@ -66,9 +66,9 @@ if(BUILD_DOUBLE)
add_executable(xdcblat2 c_dblat2.f ${DTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h)
add_executable(xdcblat3 c_dblat3.f ${DTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h)
target_link_libraries(xdcblat1 cblas)
target_link_libraries(xdcblat2 cblas)
target_link_libraries(xdcblat3 cblas)
target_link_libraries(xdcblat1 ${CBLASLIB})
target_link_libraries(xdcblat2 ${CBLASLIB})
target_link_libraries(xdcblat3 ${CBLASLIB})
add_cblas_test(dtest1.out "" xdcblat1)
add_cblas_test(dtest2.out din2 xdcblat2)
@ -80,9 +80,9 @@ if(BUILD_COMPLEX)
add_executable(xccblat2 c_cblat2.f ${CTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h)
add_executable(xccblat3 c_cblat3.f ${CTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h)
target_link_libraries(xccblat1 cblas ${BLAS_LIBRARIES})
target_link_libraries(xccblat2 cblas)
target_link_libraries(xccblat3 cblas)
target_link_libraries(xccblat1 ${CBLASLIB} ${BLAS_LIBRARIES})
target_link_libraries(xccblat2 ${CBLASLIB})
target_link_libraries(xccblat3 ${CBLASLIB})
add_cblas_test(ctest1.out "" xccblat1)
add_cblas_test(ctest2.out cin2 xccblat2)
@ -94,9 +94,9 @@ if(BUILD_COMPLEX16)
add_executable(xzcblat2 c_zblat2.f ${ZTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h)
add_executable(xzcblat3 c_zblat3.f ${ZTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h)
target_link_libraries(xzcblat1 cblas)
target_link_libraries(xzcblat2 cblas)
target_link_libraries(xzcblat3 cblas)
target_link_libraries(xzcblat1 ${CBLASLIB})
target_link_libraries(xzcblat2 ${CBLASLIB})
target_link_libraries(xzcblat3 ${CBLASLIB})
add_cblas_test(ztest1.out "" xzcblat1)
add_cblas_test(ztest2.out zin2 xzcblat2)

View File

@ -14,6 +14,19 @@ macro( CheckLAPACKCompilerFlags )
set( FPE_EXIT FALSE )
# FORTRAN ILP default
if ( FORTRAN_ILP )
if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" )
if ( WIN32 )
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} /integer-size:64")
else ()
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -integer-size 64")
endif()
else()
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fdefault-integer-8")
endif()
endif()
# GNU Fortran
if( CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" )
if( "${CMAKE_Fortran_FLAGS}" MATCHES "-ffpe-trap=[izoupd]")

View File

@ -1,7 +1,7 @@
# Load lapack targets from the build tree if necessary.
set(_LAPACK_TARGET "@_lapack_config_build_guard_target@")
if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}")
include("@LAPACK_BINARY_DIR@/lapack-targets.cmake")
include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake")
endif()
unset(_LAPACK_TARGET)

View File

@ -4,7 +4,7 @@ get_filename_component(_LAPACK_SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
# Load lapack targets from the install tree if necessary.
set(_LAPACK_TARGET "@_lapack_config_install_guard_target@")
if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}")
include("${_LAPACK_SELF_DIR}/lapack-targets.cmake")
include("${_LAPACK_SELF_DIR}/@LAPACKLIB@-targets.cmake")
endif()
unset(_LAPACK_TARGET)

View File

@ -44,6 +44,24 @@ endif()
# By default static library
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
# By default build index32 library
option(BUILD_INDEX64 "Build Index-64 API libraries" OFF)
if(BUILD_INDEX64)
set(BLASLIB "blas64")
set(CBLASLIB "cblas64")
set(LAPACKLIB "lapack64")
set(LAPACKELIB "lapacke64")
set(TMGLIB "tmglib64")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWeirdNEC -DLAPACK_ILP64 -DHAVE_LAPACK_CONFIG_H")
set(FORTRAN_ILP TRUE)
else()
set(BLASLIB "blas")
set(CBLASLIB "cblas")
set(LAPACKLIB "lapack")
set(LAPACKELIB "lapacke")
set(TMGLIB "tmglib")
endif()
include(GNUInstallDirs)
# Updated OSX RPATH settings
@ -73,10 +91,10 @@ include(PreventInBuildInstalls)
if(UNIX)
if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
list(APPEND CMAKE_Fortran_FLAGS "-fp-model strict")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
endif()
if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
list(APPEND CMAKE_Fortran_FLAGS "-qnosave -qstrict=none")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none")
endif()
# Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
@ -112,7 +130,7 @@ endif()
# --------------------------------------------------
set(LAPACK_INSTALL_EXPORT_NAME lapack-targets)
set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKLIB}-targets)
macro(lapack_install_library lib)
install(TARGETS ${lib}
@ -220,7 +238,7 @@ endif()
if(NOT BLAS_FOUND)
message(STATUS "Using supplied NETLIB BLAS implementation")
add_subdirectory(BLAS)
set(BLAS_LIBRARIES blas)
set(BLAS_LIBRARIES ${BLASLIB})
else()
set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} ${BLAS_LINKER_FLAGS}"
@ -279,7 +297,7 @@ endif()
# Neither user specified or optimized LAPACK libraries can be used
if(NOT LATESTLAPACK_FOUND)
message(STATUS "Using supplied NETLIB LAPACK implementation")
set(LAPACK_LIBRARIES lapack)
set(LAPACK_LIBRARIES ${LAPACKLIB})
add_subdirectory(SRC)
else()
set(CMAKE_EXE_LINKER_FLAGS
@ -371,23 +389,23 @@ include(CPack)
# --------------------------------------------------
if(NOT BLAS_FOUND)
set(ALL_TARGETS ${ALL_TARGETS} blas)
set(ALL_TARGETS ${ALL_TARGETS} ${BLASLIB})
endif()
if(NOT LATESTLAPACK_FOUND)
set(ALL_TARGETS ${ALL_TARGETS} lapack)
set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKLIB})
endif()
if(BUILD_TESTING OR LAPACKE_WITH_TMG)
set(ALL_TARGETS ${ALL_TARGETS} tmglib)
set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB})
endif()
# Export lapack targets, not including lapacke, from the
# install tree, if any.
set(_lapack_config_install_guard_target "")
if(ALL_TARGETS)
install(EXPORT lapack-targets
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION}
install(EXPORT ${LAPACKLIB}-targets
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION}
COMPONENT Development
)
@ -398,18 +416,18 @@ endif()
# Include cblas in targets exported from the build tree.
if(CBLAS)
set(ALL_TARGETS ${ALL_TARGETS} cblas)
set(ALL_TARGETS ${ALL_TARGETS} ${CBLASLIB})
endif()
# Include lapacke in targets exported from the build tree.
if(LAPACKE)
set(ALL_TARGETS ${ALL_TARGETS} lapacke)
set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKELIB})
endif()
# Export lapack and lapacke targets from the build tree, if any.
set(_lapack_config_build_guard_target "")
if(ALL_TARGETS)
export(TARGETS ${ALL_TARGETS} FILE lapack-targets.cmake)
export(TARGETS ${ALL_TARGETS} FILE ${LAPACKLIB}-targets.cmake)
# Choose one of the lapack or lapacke targets to use as a guard
# for lapack-config.cmake to load targets from the build tree.
@ -417,30 +435,30 @@ if(ALL_TARGETS)
endif()
configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-build.cmake.in
${LAPACK_BINARY_DIR}/lapack-config.cmake @ONLY)
${LAPACK_BINARY_DIR}/${LAPACKLIB}-config.cmake @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/lapack.pc @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKLIB}.pc @ONLY)
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/lapack.pc
${CMAKE_CURRENT_BINARY_DIR}/${LAPACKLIB}.pc
DESTINATION ${PKG_CONFIG_DIR}
COMPONENT Development
)
configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-install.cmake.in
${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake @ONLY)
${LAPACK_BINARY_DIR}/CMakeFiles/${LAPACKLIB}-config.cmake @ONLY)
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
${LAPACK_BINARY_DIR}/lapack-config-version.cmake
${LAPACK_BINARY_DIR}/${LAPACKLIB}-config-version.cmake
VERSION ${LAPACK_VERSION}
COMPATIBILITY SameMajorVersion
)
install(FILES
${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake
${LAPACK_BINARY_DIR}/lapack-config-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION}
${LAPACK_BINARY_DIR}/CMakeFiles/${LAPACKLIB}-config.cmake
${LAPACK_BINARY_DIR}/${LAPACKLIB}-config-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION}
COMPONENT Development
)

View File

@ -1,7 +1,7 @@
message(STATUS "LAPACKE enable")
enable_language(C)
set(LAPACK_INSTALL_EXPORT_NAME lapacke-targets)
set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKELIB}-targets)
# Create a header file lapacke_mangling.h for the routines called in my C programs
include(FortranCInterface)
@ -72,28 +72,28 @@ if(LAPACKE_WITH_TMG)
endif()
list(APPEND SOURCES ${UTILS})
add_library(lapacke ${SOURCES})
add_library(${LAPACKELIB} ${SOURCES})
set_target_properties(
lapacke PROPERTIES
${LAPACKELIB} PROPERTIES
LINKER_LANGUAGE C
VERSION ${LAPACK_VERSION}
SOVERSION ${LAPACK_MAJOR_VERSION}
)
target_include_directories(lapacke PUBLIC
target_include_directories(${LAPACKELIB} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
)
if(WIN32 AND NOT UNIX)
target_compile_definitions(lapacke PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE)
target_compile_definitions(${LAPACKELIB} PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE)
message(STATUS "Windows BUILD")
endif()
if(LAPACKE_WITH_TMG)
target_link_libraries(lapacke PRIVATE tmglib)
target_link_libraries(${LAPACKELIB} PRIVATE ${TMGLIB})
endif()
target_link_libraries(lapacke PRIVATE ${LAPACK_LIBRARIES})
target_link_libraries(${LAPACKELIB} PRIVATE ${LAPACK_LIBRARIES})
lapack_install_library(lapacke)
lapack_install_library(${LAPACKELIB})
install(
FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
@ -105,28 +105,28 @@ if(BUILD_TESTING)
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LAPACKELIB}.pc @ONLY)
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc
${CMAKE_CURRENT_BINARY_DIR}/${LAPACKELIB}.pc
DESTINATION ${PKG_CONFIG_DIR}
COMPONENT Development
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-version.cmake.in
${LAPACK_BINARY_DIR}/lapacke-config-version.cmake @ONLY)
${LAPACK_BINARY_DIR}/${LAPACKELIB}-config-version.cmake @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-build.cmake.in
${LAPACK_BINARY_DIR}/lapacke-config.cmake @ONLY)
${LAPACK_BINARY_DIR}/${LAPACKELIB}-config.cmake @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-install.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake @ONLY)
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${LAPACKELIB}-config.cmake @ONLY)
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake
${LAPACK_BINARY_DIR}/lapacke-config-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION}
${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${LAPACKELIB}-config.cmake
${LAPACK_BINARY_DIR}/${LAPACKELIB}-config-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKELIB}-${LAPACK_VERSION}
COMPONENT Development
)
install(EXPORT lapacke-targets
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION}
install(EXPORT ${LAPACKELIB}-targets
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKELIB}-${LAPACK_VERSION}
COMPONENT Development
)

View File

@ -3,8 +3,8 @@ set(LAPACK_DIR "@LAPACK_BINARY_DIR@")
find_package(LAPACK NO_MODULE)
# Load lapack targets from the build tree, including lapacke targets.
if(NOT TARGET lapacke)
include("@LAPACK_BINARY_DIR@/lapack-targets.cmake")
if(NOT TARGET @LAPACKELIB@)
include("@LAPACK_BINARY_DIR@/@LAPACKLIB@-targets.cmake")
endif()
# Hint for project building against lapack
@ -14,4 +14,4 @@ set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID})
set(LAPACKE_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include")
# Report lapacke libraries.
set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES})
set(LAPACKE_LIBRARIES @LAPACKELIB@ ${LAPACK_LIBRARIES})

View File

@ -5,12 +5,12 @@ get_filename_component(_LAPACKE_PREFIX "${_LAPACKE_PREFIX}" PATH)
get_filename_component(_LAPACKE_PREFIX "${_LAPACKE_PREFIX}" PATH)
# Load the LAPACK package with which we were built.
set(LAPACK_DIR "${_LAPACKE_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/lapack-@LAPACK_VERSION@")
set(LAPACK_DIR "${_LAPACKE_PREFIX}/@CMAKE_INSTALL_LIBDIR@/cmake/@LAPACK@-@LAPACK_VERSION@")
find_package(LAPACK NO_MODULE)
# Load lapacke targets from the install tree.
if(NOT TARGET lapacke)
include(${_LAPACKE_SELF_DIR}/lapacke-targets.cmake)
if(NOT TARGET @LAPACKELIB@)
include(${_LAPACKE_SELF_DIR}/@LAPACKELIB@-targets.cmake)
endif()
# Hint for project building against lapack
@ -20,7 +20,7 @@ set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID})
set(LAPACKE_INCLUDE_DIRS ${_LAPACKE_PREFIX}/include)
# Report lapacke libraries.
set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES})
set(LAPACKE_LIBRARIES @LAPACKELIB@ ${LAPACK_LIBRARIES})
unset(_LAPACKE_PREFIX)
unset(_LAPACKE_SELF_DIR)

View File

@ -3,10 +3,10 @@ add_executable(xexample_DGESV_colmajor example_DGESV_colmajor.c lapacke_example_
add_executable(xexample_DGELS_rowmajor example_DGELS_rowmajor.c lapacke_example_aux.c lapacke_example_aux.h)
add_executable(xexample_DGELS_colmajor example_DGELS_colmajor.c lapacke_example_aux.c lapacke_example_aux.h)
target_link_libraries(xexample_DGESV_rowmajor lapacke)
target_link_libraries(xexample_DGESV_colmajor lapacke)
target_link_libraries(xexample_DGELS_rowmajor lapacke)
target_link_libraries(xexample_DGELS_colmajor lapacke)
target_link_libraries(xexample_DGESV_rowmajor ${LAPACKELIB})
target_link_libraries(xexample_DGESV_colmajor ${LAPACKELIB})
target_link_libraries(xexample_DGELS_rowmajor ${LAPACKELIB})
target_link_libraries(xexample_DGELS_colmajor ${LAPACKELIB})
add_test(example_DGESV_rowmajor ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample_DGESV_rowmajor)
add_test(example_DGESV_colmajor ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample_DGESV_colmajor)

View File

@ -49,12 +49,13 @@ extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
#include <stdint.h>
#ifndef lapack_int
#if defined(LAPACK_ILP64)
#define lapack_int long
#define lapack_int int64_t
#else
#define lapack_int int
#define lapack_int int32_t
#endif
#endif

View File

@ -67,7 +67,11 @@ extern "C" {
void LAPACKE_xerbla( const char *name, lapack_int info );
/* Compare two chars (case-insensitive) */
lapack_logical LAPACKE_lsame( char ca, char cb );
lapack_logical LAPACKE_lsame( char ca, char cb )
#if defined __GNUC__
__attribute__((const))
#endif
;
/* Functions to convert column-major to row-major 2d arrays and vice versa. */
void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n,

View File

@ -5,6 +5,6 @@ Name: LAPACKE
Description: C Standard Interface to LAPACK Linear Algebra PACKage
Version: @LAPACK_VERSION@
URL: http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack
Libs: -L${libdir} -llapacke
Libs: -L${libdir} -l@LAPACKELIB@
Cflags: -I${includedir}
Requires.private: lapack
Requires.private: @LAPACKLIB@

View File

@ -500,21 +500,21 @@ if(BUILD_COMPLEX16)
endif()
list(REMOVE_DUPLICATES SOURCES)
add_library(lapack ${SOURCES})
add_library(${LAPACKLIB} ${SOURCES})
set_target_properties(
lapack PROPERTIES
${LAPACKLIB} PROPERTIES
VERSION ${LAPACK_VERSION}
SOVERSION ${LAPACK_MAJOR_VERSION}
)
if(USE_XBLAS)
target_link_libraries(lapack PRIVATE ${XBLAS_LIBRARY})
target_link_libraries(${LAPACKLIB} PRIVATE ${XBLAS_LIBRARY})
endif()
target_link_libraries(lapack PRIVATE ${BLAS_LIBRARIES})
target_link_libraries(${LAPACKLIB} PRIVATE ${BLAS_LIBRARIES})
if(_is_coverage_build)
target_link_libraries(lapack PRIVATE gcov)
add_coverage(lapack)
target_link_libraries(${LAPACKLIB} PRIVATE gcov)
add_coverage(${LAPACKLIB})
endif()
lapack_install_library(lapack)
lapack_install_library(${LAPACKLIB})

View File

@ -47,6 +47,6 @@ if(BUILD_COMPLEX16)
endif()
list(REMOVE_DUPLICATES SOURCES)
add_library(tmglib ${SOURCES})
target_link_libraries(tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
lapack_install_library(tmglib)
add_library(${TMGLIB} ${SOURCES})
target_link_libraries(${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
lapack_install_library(${TMGLIB})

12
param.h
View File

@ -3128,9 +3128,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#if defined(CORTEXA57) || \
#if defined(CORTEXA57) || defined(CORTEXX1) || \
defined(CORTEXA72) || defined(CORTEXA73) || \
defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)
defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000)
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
@ -3147,7 +3147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*FIXME: this should be using the cache size, but there is currently no easy way to
query that on ARM. So if getarch counted more than 8 cores we simply assume the host
is a big desktop or server with abundant cache rather than a phone or embedded device */
#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)
#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1)
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
@ -3377,7 +3377,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#elif defined(ARMV8SVE) || defined(A64FX)
#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2)
/* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */
@ -3423,8 +3423,8 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4

View File

@ -115,7 +115,7 @@
#define INCLUDE_CTGSYL INCLUDE_XTGSYL
#define INCLUDE_ZTGSYL INCLUDE_XTGSYL
#define INCLUDE_XGEMMT 0
#define INCLUDE_XGEMMT 1
#define INCLUDE_SGEMMT INCLUDE_XGEMMT
#define INCLUDE_DGEMMT INCLUDE_XGEMMT
#define INCLUDE_CGEMMT INCLUDE_XGEMMT

View File

@ -566,7 +566,8 @@ void LAPACK(sgemmt)(
const float *B, const blasint *ldB,
const float *beta, float *C, const blasint *ldC
) {
RELAPACK_sgemmt(uplo, n, A, ldA, info);
blasint info;
RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info);
}
#endif
@ -578,7 +579,8 @@ void LAPACK(dgemmt)(
const double *B, const blasint *ldB,
const double *beta, double *C, const blasint *ldC
) {
RELAPACK_dgemmt(uplo, n, A, ldA, info);
blasint info;
RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info);
}
#endif
@ -590,7 +592,8 @@ void LAPACK(cgemmt)(
const float *B, const blasint *ldB,
const float *beta, float *C, const blasint *ldC
) {
RELAPACK_cgemmt(uplo, n, A, ldA, info);
blasint info;
RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info);
}
#endif
@ -602,6 +605,7 @@ void LAPACK(zgemmt)(
const double *B, const blasint *ldB,
const double *beta, double *C, const blasint *ldC
) {
RELAPACK_zgemmt(uplo, n, A, ldA, info);
blasint info;
RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info);
}
#endif

View File

@ -30,6 +30,10 @@ if(WIN32)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1
"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n"
"$ErrorActionPreference = \"Stop\"\n"
"If ((Get-Content $args[1] | & file - | %{$_ -match \"BOM\"}) -contains $true) {\n"
"echo 'Skipped due to wrong input encoding'\n"
"exit 0\n"
"}\n"
"Get-Content $args[1] | & $args[0]\n"
"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n"
"echo Error\n"