diff --git a/.gitignore b/.gitignore index 6cfc5b3c1..118205ca2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,23 @@ *.obj *.lib *.dll +*.dylib *.def *.o lapack-3.1.1 lapack-3.1.1.tgz +lapack-3.4.1 +lapack-3.4.1.tgz *.so *.a .svn *~ +lib.grd +nohup.out config.h Makefile.conf +Makefile.conf_last +config_last.h getarch getarch_2nd utest/openblas_utest diff --git a/Changelog.txt b/Changelog.txt index 0ed35b0e4..c222c7eee 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,17 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.0 +26-Jun-2012 +common: + * Removed the limitation (64) of numbers of CPU cores. + Now, it supports 256 cores at max. + * Supported clang compiler. + * Fixed some build bugs on FreeBSD +x86/x86-64: + * Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions. + Please use gcc >= 4.6 or clang >=3.1. + * Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes. + ==================================================================== Version 0.1.1 29-Apr-2012 @@ -7,6 +20,8 @@ common: * Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia) * Fixed the build bug (MD5 and download) on Mac OSX. * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. + * Fixed the compatibility issue for compilers without C99 complex number + (e.g. Visual Studio) x86/x86_64: * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. * Test alpha=Nan in dscale. diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt index b6033fe53..be623d608 100644 --- a/GotoBLAS_03FAQ.txt +++ b/GotoBLAS_03FAQ.txt @@ -90,6 +90,15 @@ number of threads will consume extra resource. I recommend you to specify minimum number of threads. +1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong? + + A This may be related to a bug in the Linux kernel 2.6.32. Try applying + the patch segfaults.patch using + + patch < segfaults.patch + + and see if the crashes persist. 
Note that this patch will lead to many + compiler warnings. 2. Architecture Specific issue or Implementation diff --git a/Makefile b/Makefile index 8d78a844b..796217291 100644 --- a/Makefile +++ b/Makefile @@ -256,12 +256,17 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz lapack-3.4.1.tgz : ifndef NOFORTRAN -ifeq ($(OSNAME), Darwin) +#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or +ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) curl -O $(LAPACK_URL) +else +ifeq ($(OSNAME), FreeBSD) + fetch $(LAPACK_URL) else wget $(LAPACK_URL) endif endif +endif large.tgz : ifndef NOFORTRAN diff --git a/Makefile.rule b/Makefile.rule index b6cf98a3e..299273773 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.1.1 +VERSION = 0.2.0 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -94,8 +94,8 @@ VERSION = 0.1.1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. -# GEMM_MULTITHREAD_THRESHOLD = 4 +# in small matrix sizes. The default value is 50. +# GEMM_MULTITHREAD_THRESHOLD = 50 # If you need santy check by comparing reference BLAS. It'll be very # slow (Not implemented yet). diff --git a/Makefile.system b/Makefile.system index e2fe9f730..425cbb68a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -14,7 +14,15 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1 endif # Default C compiler +# - Only set if not specified on the command line or inherited from the environment. +# - CC is an implicit variable so neither '?=' or 'ifndef' can be used. +# http://stackoverflow.com/questions/4029274/mingw-and-make-variables +# - Default value is 'cc' which is not always a valid command (e.g. MinGW). 
+ifeq ($(origin CC),default) CC = gcc +endif + +# Default Fortran compiler (FC) is selected by f_check. ifndef MAKEFILE_RULE include $(TOPDIR)/Makefile.rule @@ -45,7 +53,7 @@ GETARCH_FLAGS += -DUSE64BITINT endif ifndef GEMM_MULTITHREAD_THRESHOLD -GEMM_MULTITHREAD_THRESHOLD=4 +GEMM_MULTITHREAD_THRESHOLD=50 endif GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) @@ -108,6 +116,14 @@ export MACOSX_DEPLOYMENT_TARGET=10.2 MD5SUM = md5 -r endif +ifeq ($(OSNAME), FreeBSD) +MD5SUM = md5 -r +endif + +ifeq ($(OSNAME), NetBSD) +MD5SUM = md5 -r +endif + ifeq ($(OSNAME), Linux) EXTRALIB += -lm endif @@ -231,11 +247,11 @@ endif ifdef DYNAMIC_ARCH ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO endif ifndef DYNAMIC_CORE @@ -754,6 +770,7 @@ export HAVE_SSE4_1 export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 +export HAVE_AVX export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/README b/README deleted file mode 100644 index 6372e96bd..000000000 --- a/README +++ /dev/null @@ -1,84 +0,0 @@ -OpenBLAS Readme - -1.Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) -Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki). - -2.Intallation -Download from project homepage. 
http://xianyi.github.com/OpenBLAS/ -Or, -check out codes from git://github.com/xianyi/OpenBLAS.git -1)Normal compile - (a) type "make" to detect the CPU automatically. - or - (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. - -2)Cross compile -Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. - -examples: -On X86 box, compile this library for loongson3a CPU. -make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A - -3)Debug version -make DEBUG=1 - -4)Intall to the directory (Optional) -e.g. -make install PREFIX=your_installation_directory -The default directory is /opt/OpenBLAS - -3.Support CPU & OS -Please read GotoBLAS_01Readme.txt - -Additional support CPU: -x86_64: - Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. -MIPS64: - ICT Loongson 3A //Level 3 BLAS subroutines are optimized. - -4.Usages -Link with libopenblas.a or -lopenblas for shared library. - -4.1 Set the number of threads with environment variables. for example, -export OPENBLAS_NUM_THREADS=4 - or -export GOTO_NUM_THREADS=4 - or -export OMP_NUM_THREADS=4 - -The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. - -If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. - -4.2 Set the number of threads with calling functions. for example, -void goto_set_num_threads(int num_threads); -or -void openblas_set_num_threads(int num_threads); - -If you compile this lib with USE_OPENMP=1, you should use the above functions, too. 
- -5.Report Bugs -Please add a issue in https://github.com/xianyi/OpenBLAS/issues - -6.To-Do List: -Optimization on ICT Loongson 3A CPU - -7.Contact -OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas - -8.ChangeLog -Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. - -9.Known Issues -* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit - is 64. On 32 bits, it is 32. -* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS. - -10. Specification of Git Branches -We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). -Now, there are 4 branches in github.com. - * The master branch. This a main branch to reflect a production-ready state. - * The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release. - * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. - * The gh-pages branch. This is for web pages diff --git a/README.md b/README.md new file mode 100644 index 000000000..82e9f528c --- /dev/null +++ b/README.md @@ -0,0 +1,110 @@ +# OpenBLAS + +## Introduction +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS . + +Please read the documents on OpenBLAS wiki pages . + +## Installation +Download from project homepage. http://xianyi.github.com/OpenBLAS/ + +Or, check out codes from git://github.com/xianyi/OpenBLAS.git +### Normal compile + * type "make" to detect the CPU automatically. + or + * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". 
The full target list is in file TargetList.txt. + +### Cross compile +Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. + +Examples: + +On X86 box, compile this library for loongson3a CPU. + + make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A + +### Debug version + + make DEBUG=1 + +### Install to the directory (Optional) + +Example: + + make install PREFIX=your_installation_directory + +The default directory is /opt/OpenBLAS + +## Support CPU & OS +Please read GotoBLAS_01Readme.txt + +### Additional support CPU: + +#### x86/x86-64: +- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. +- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. +- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. + +#### MIPS64: +- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. +- **ICT Loongson 3B**: Experimental + +### Support OS: +- **GNU/Linux** +- **MingWin/Windows**: Please read . +- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. +- **FreeBSD**: Supported by community. We didn't test the library on this OS. + +## Usages +Link with libopenblas.a or -lopenblas for shared library. + +### Set the number of threads with environment variables. + +Examples: + + export OPENBLAS_NUM_THREADS=4 + + or + + export GOTO_NUM_THREADS=4 + + or + + export OMP_NUM_THREADS=4 + +The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. + +If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. + +### Set the number of threads on runtime. + +We provided the below functions to control the number of threads on runtime. So far, we didn't support changing the number of threads on Windows. On Windows, these functions are dummy. 
+ + void goto_set_num_threads(int num_threads); + + void openblas_set_num_threads(int num_threads); + +If you compile this lib with USE_OPENMP=1, you should use the above functions, too. + +## Report Bugs +Please add an issue in https://github.com/xianyi/OpenBLAS/issues + +## Contact +OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas + +## ChangeLog +Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. + +## Troubleshooting +* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/Cores should be less than or equal to 256. +* On Loongson 3A, make test would fail because of a pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. + +## Specification of Git Branches +We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). +Now, there are 4 branches in github.com. + * The master branch. This is a main branch to reflect a production-ready state. + * The develop branch. This is a main branch to reflect a state with the latest delivered development changes for the next release. + * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. + * The gh-pages branch. 
This is for web pages diff --git a/TargetList.txt b/TargetList.txt index 1c3d7c5b9..1a212e6ca 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -18,6 +18,7 @@ CORE2 PENRYN DUNNINGTON NEHALEM +SANDYBRIDGE ATOM b)AMD CPU: @@ -27,6 +28,7 @@ OPTERON_SSE3 BARCELONA SHANGHAI ISTANBUL +BOBCAT c)VIA CPU: SSE_GENERIC @@ -47,6 +49,7 @@ CELL 3.MIPS64 CPU: SICORTEX LOONGSON3A +LOONGSON3B 4.IA64 CPU: ITANIUM2 diff --git a/c_check b/c_check index 6ce5e4cc0..4d82237d4 100644 --- a/c_check +++ b/c_check @@ -43,14 +43,14 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/); $compiler = GCC if ($compiler eq ""); $os = Linux if ($data =~ /OS_LINUX/); -$os = FreeBSD if ($data =~ /OS_FreeBSD/); -$os = NetBSD if ($data =~ /OS_NetBSD/); -$os = Darwin if ($data =~ /OS_Darwin/); -$os = SunOS if ($data =~ /OS_SunOS/); +$os = FreeBSD if ($data =~ /OS_FREEBSD/); +$os = NetBSD if ($data =~ /OS_NETBSD/); +$os = Darwin if ($data =~ /OS_DARWIN/); +$os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); $os = osf if ($data =~ /OS_OSF/); $os = WINNT if ($data =~ /OS_WINNT/); -$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); +$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = Interix if ($data =~ /OS_INTERIX/); $architecture = x86 if ($data =~ /ARCH_X86/); diff --git a/cblas.h b/cblas.h index f3708a994..ee8bf08b2 100644 --- a/cblas.h +++ b/cblas.h @@ -9,6 +9,10 @@ extern "C" { #include #include "common.h" +/*Set the number of threads on runtime.*/ +void openblas_set_num_threads(int num_threads); +void goto_set_num_threads(int num_threads); + #define CBLAS_INDEX size_t enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; diff --git a/common.h b/common.h index c6d30ddcf..3718cdee4 100644 --- a/common.h +++ b/common.h @@ -68,7 +68,7 @@ extern "C" { #define SMP #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define WINDOWS_ABI #define OS_WINDOWS @@ -89,7 +89,7 @@ extern "C" { #include #endif 
-#ifdef OS_DARWIN +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) #include #endif diff --git a/common_interface.h b/common_interface.h index 898d91001..dbe0bb851 100644 --- a/common_interface.h +++ b/common_interface.h @@ -45,6 +45,8 @@ extern "C" { int BLASFUNC(xerbla)(char *, blasint *info, blasint); +void BLASFUNC(openblas_set_num_threads)(int *); + FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); diff --git a/common_thread.h b/common_thread.h index dc963a635..97e060976 100644 --- a/common_thread.h +++ b/common_thread.h @@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) { int openmp_nthreads=0; #endif - if ((blas_cpu_number == 1) + if (blas_cpu_number == 1 #ifdef USE_OPENMP || omp_in_parallel() diff --git a/common_x86.h b/common_x86.h index fbb91f888..4316318ec 100644 --- a/common_x86.h +++ b/common_x86.h @@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define PROFCODE #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define SAVEREGISTERS \ subl $32, %esp;\ movups %xmm6, 0(%esp);\ @@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define RESTOREREGISTERS #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define PROLOGUE \ .text; \ .align 16; \ @@ -282,7 +282,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ @@ -356,4 +356,11 @@ REALNAME: #ifndef ALIGN_6 #define ALIGN_6 .align 64 + +// ffreep %st(0). 
+// Because Clang didn't support ffreep, we directly use the opcode. +// Please check out http://www.sandpile.org/x86/opc_fpu.htm +#ifndef ffreep +#define ffreep .byte 0xdf, 0xc0 # +#endif #endif diff --git a/common_x86_64.h b/common_x86_64.h index 53b702185..19b0ac53c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -353,7 +353,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ @@ -425,6 +425,7 @@ REALNAME: #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 +#define ALIGN_5 .align 5 #define ffreep fstp #endif @@ -448,4 +449,10 @@ REALNAME: #define ALIGN_6 .align 64 #endif +// ffreep %st(0). +// Because Clang didn't support ffreep, we directly use the opcode. +// Please check out http://www.sandpile.org/x86/opc_fpu.htm +#ifndef ffreep +#define ffreep .byte 0xdf, 0xc0 # +#endif #endif diff --git a/cpuid.h b/cpuid.h index 665ede077..fdcfcea00 100644 --- a/cpuid.h +++ b/cpuid.h @@ -103,6 +103,8 @@ #define CORE_NEHALEM 17 #define CORE_ATOM 18 #define CORE_NANO 19 +#define CORE_SANDYBRIDGE 20 +#define CORE_BOBCAT 21 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -122,6 +124,7 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) +#define HAVE_AVX (1 << 18) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -188,4 +191,6 @@ typedef struct { #define CPUTYPE_NSGEODE 41 #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 +#define CPUTYPE_SANDYBRIDGE 44 +#define CPUTYPE_BOBCAT 45 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 7b36fdbdf..204f41d51 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -189,6 +189,7 @@ int get_cputype(int gettype){ if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 19)) != 0) feature |= 
HAVE_SSE4_1; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; + if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX; if (have_excpuid() >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); @@ -983,13 +984,13 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CPUTYPE_NEHALEM; + return CPUTYPE_SANDYBRIDGE; case 12: //Xeon Processor 5600 (Westmere-EP) return CPUTYPE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CPUTYPE_NEHALEM; + return CPUTYPE_SANDYBRIDGE; case 15: //Xeon Processor E7 (Westmere-EX) return CPUTYPE_NEHALEM; @@ -1027,6 +1028,8 @@ int get_cpuname(void){ case 1: case 10: return CPUTYPE_BARCELONA; + case 5: + return CPUTYPE_BOBCAT; } break; } @@ -1146,6 +1149,8 @@ static char *cpuname[] = { "NSGEODE", "VIAC3", "NANO", + "SANDYBRIDGE", + "BOBCAT", }; static char *lowercpuname[] = { @@ -1192,6 +1197,8 @@ static char *lowercpuname[] = { "tms3x00", "nsgeode", "nano", + "sandybridge", + "bobcat", }; static char *corename[] = { @@ -1215,6 +1222,8 @@ static char *corename[] = { "NEHALEM", "ATOM", "NANO", + "SANDYBRIDGE", + "BOBCAT", }; static char *corename_lower[] = { @@ -1238,6 +1247,8 @@ static char *corename_lower[] = { "nehalem", "atom", "nano", + "sandybridge", + "bobcat", }; @@ -1321,13 +1332,13 @@ int get_coretype(void){ return CORE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CORE_NEHALEM; + return CORE_SANDYBRIDGE; case 12: //Xeon Processor 5600 (Westmere-EP) return CORE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CORE_NEHALEM; + return CORE_SANDYBRIDGE; case 15: //Xeon Processor E7 (Westmere-EX) return CORE_NEHALEM; @@ -1346,7 +1357,9 @@ int get_coretype(void){ if (family <= 0x5) return CORE_80486; if (family <= 0xe) return CORE_ATHLON; if (family == 0xf){ - if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; + if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; + 
else if (exfamily == 5) return CORE_BOBCAT; + else return CORE_BARCELONA; } } @@ -1426,6 +1439,7 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); + if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); @@ -1491,6 +1505,7 @@ void get_sse(void){ if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); + if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); diff --git a/ctest.c b/ctest.c index 0c373bf2b..9fc0b0c40 100644 --- a/ctest.c +++ b/ctest.c @@ -35,19 +35,19 @@ OS_LINUX #endif #if defined(__FreeBSD__) -OS_FreeBSD +OS_FREEBSD #endif #if defined(__NetBSD__) -OS_NetBSD +OS_NETBSD #endif #if defined(__sun) -OS_SunOS +OS_SUNOS #endif #if defined(__APPLE__) -OS_Darwin +OS_DARWIN #endif #if defined(_AIX) @@ -63,7 +63,7 @@ OS_WINNT #endif #if defined(__CYGWIN__) -OS_CYGWIN +OS_CYGWIN_NT #endif #if defined(__INTERIX) diff --git a/driver/others/Makefile b/driver/others/Makefile index 75b552b65..2fdbb4a42 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,12 +1,12 @@ TOPDIR = ../.. 
include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP -COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) ifndef NO_AFFINITY COMMONOBJS += init.$(SUFFIX) endif diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 6708509e1..c71e7c276 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -63,6 +63,14 @@ static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; +void goto_set_num_threads(int num) +{ +} + +void openblas_set_num_threads(int num) +{ +} + static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ diff --git a/driver/others/init.c b/driver/others/init.c index 4adba661f..4a6f0aae8 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MAX_NODES 16 #define MAX_CPUS 256 +#define NCPUBITS (8*sizeof(unsigned long)) +#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) +#define CPUELT(cpu) ((cpu) / NCPUBITS) +#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) + #define SH_MAGIC 0x510510 @@ -103,10 +108,10 @@ typedef struct { int num_nodes; int num_procs; int final_num_procs; - unsigned long avail; - + unsigned long avail [MAX_BITMASK_LEN]; + int avail_count; unsigned long cpu_info [MAX_CPUS]; - unsigned long node_info [MAX_NODES]; + unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; int cpu_use[MAX_CPUS]; } shm_t; @@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; static int shmid, pshmid; static void *paddr; -static unsigned long lprocmask, lnodemask; +static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; +static int lprocmask_count = 0; static int numprocs = 1; static int numnodes = 1; @@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { than sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. ***/ -static inline unsigned long get_cpumap(int node) { +static inline void get_cpumap(int node, unsigned long * node_info) { int infile; - unsigned long affinity; + unsigned long affinity[32]; char name[160]; char cpumap[160]; - char *p, *dummy; + char *dummy; int i=0; + int count=0; + int k=0; sprintf(name, CPUMAP_NAME, node); infile = open(name, O_RDONLY); + for(i=0; i<32; i++){ + affinity[i] = 0; + } - affinity = 0; - if (infile != -1) { read(infile, cpumap, sizeof(cpumap)); - p = cpumap; - while (*p != '\n' && i<160){ - if(*p != ',') { - name[i++]=*p; + + for(i=0; i<160; i++){ + if(cpumap[i] == '\n') + break; + if(cpumap[i] != ','){ + name[k++]=cpumap[i]; + + //Enough data for Hex + if(k >= NCPUBITS/4){ + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } } - p++; + + } + if(k!=0){ + name[k]='\0'; + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... 
+ // revert the sequence + for(i=0; i= NCPUBITS/4){ + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + } + + } + if(k!=0){ + name[k]='\0'; + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... + // revert the sequence + for(i=0; i num_nodes = 0; @@ -258,7 +309,9 @@ static int numa_check(void) { return 0; } - for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; + for (node = 0; node < MAX_NODES; node ++) { + for (j = 0; j node_info[node][j] = 0; + } while ((dir = readdir(dp)) != NULL) { if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { @@ -266,12 +319,12 @@ static int numa_check(void) { node = atoi(&dir -> d_name[4]); if (node > MAX_NODES) { - fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); + fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n"); exit(1); } common -> num_nodes ++; - common -> node_info[node] = get_cpumap(node); + get_cpumap(node, common->node_info[node]); } } @@ -284,7 +337,7 @@ static int numa_check(void) { fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); for (node = 0; node < common -> num_nodes; node ++) - fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); + fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); #endif return common -> num_nodes; @@ -296,11 +349,13 @@ static void numa_mapping(void) { int i, j, h; unsigned long work, bit; int count = 0; + int bitmask_idx = 0; for (node = 0; node < common -> num_nodes; node ++) { core = 0; for (cpu = 0; cpu < common -> num_procs; cpu ++) { - if (common -> node_info[node] & common -> avail & (1UL << cpu)) { + bitmask_idx = CPUELT(cpu); + if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); count ++; core ++; @@ -357,58 +412,89 @@ static 
void numa_mapping(void) { static void disable_hyperthread(void) { - unsigned long share; + unsigned long share[MAX_BITMASK_LEN]; int cpu; + int bitmask_idx = 0; + int i=0, count=0; + bitmask_idx = CPUELT(common -> num_procs); - if(common->num_procs > 64){ - fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); - exit(1); - }else if(common->num_procs == 64){ - common -> avail = 0xFFFFFFFFFFFFFFFFUL; - }else - common -> avail = (1UL << common -> num_procs) - 1; + for(i=0; i< bitmask_idx; i++){ + common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; + } + if(CPUMASK(common -> num_procs) != 1){ + common -> avail[count++] = CPUMASK(common -> num_procs) - 1; + } + common -> avail_count = count; + + /* if(common->num_procs > 64){ */ + /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ + /* exit(1); */ + /* }else if(common->num_procs == 64){ */ + /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ + /* }else */ + /* common -> avail = (1UL << common -> num_procs) - 1; */ #ifdef DEBUG - fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); + fprintf(stderr, "\nAvail CPUs : "); + for(i=0; i avail[i]); + fprintf(stderr, ".\n"); #endif for (cpu = 0; cpu < common -> num_procs; cpu ++) { - - share = (get_share(cpu, 1) & common -> avail); - - if (popcount(share) > 1) { + + get_share(cpu, 1, share); + + //When the shared cpu are in different element of share & avail array, this may be a bug. 
+ for (i = 0; i < count ; i++){ + if (popcount(share[i]) > 1) { #ifdef DEBUG - fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", - cpu, share & ~(1UL << cpu)); + fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", + cpu, share[i] & ~(CPUMASK(cpu))); #endif - common -> avail &= ~((share & ~(1UL << cpu))); + common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); + } } } } static void disable_affinity(void) { - + int i=0; + int bitmask_idx=0; + int count=0; #ifdef DEBUG - fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); + fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); #endif - if(common->final_num_procs > 64){ - fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); - exit(1); - }else if(common->final_num_procs == 64){ - lprocmask = 0xFFFFFFFFFFFFFFFFUL; - }else - lprocmask = (1UL << common -> final_num_procs) - 1; + /* if(common->final_num_procs > 64){ */ + /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). 
Terminated.\n", common->final_num_procs); */ + /* exit(1); */ + /* }else if(common->final_num_procs == 64){ */ + /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ + /* }else */ + /* lprocmask = (1UL << common -> final_num_procs) - 1; */ + + bitmask_idx = CPUELT(common -> final_num_procs); + + for(i=0; i< bitmask_idx; i++){ + lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; + } + if(CPUMASK(common -> final_num_procs) != 1){ + lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; + } + lprocmask_count = count; #ifndef USE_OPENMP - lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; + for(i=0; i< count; i++){ + lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; + } #endif #ifdef DEBUG - fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); + fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); #endif } @@ -498,7 +584,7 @@ static void create_pshmem(void) { static void local_cpu_map(void) { int cpu, id, mapping; - + int bitmask_idx = 0; cpu = 0; mapping = 0; @@ -508,8 +594,9 @@ static void local_cpu_map(void) { if (id > 0) { if (is_dead(id)) common -> cpu_use[cpu] = 0; } - - if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { + + bitmask_idx = CPUELT(cpu); + if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { common -> cpu_use[cpu] = pshmid; cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); @@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) { #ifndef USE_OPENMP cpu_set_t cpu_mask; #endif + int i; if (initialized) return; @@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) { common -> num_procs = get_nprocs(); + if(common -> num_procs > MAX_CPUS) { + fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). 
Terminated.\n", common->num_procs, MAX_CPUS); + exit(1); + } + for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; numa_check(); @@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) { if (common -> num_nodes > 1) numa_mapping(); - common -> final_num_procs = popcount(common -> avail); + common -> final_num_procs = 0; + for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; @@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) { disable_affinity(); - num_avail = popcount(lprocmask); + num_avail = 0; + for(i=0; i num_avail)) numprocs = num_avail; diff --git a/driver/others/memory.c b/driver/others/memory.c index 3f1a5f60a..9b8863f39 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_DARWIN) #include #endif @@ -185,7 +185,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_DARWIN) int get_num_procs(void) { @@ -215,7 +215,7 @@ int goto_get_num_procs (void) { int blas_get_cpu_number(void){ char *p; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) int max_num; #endif int blas_goto_num = 0; @@ -223,7 +223,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) max_num = get_num_procs(); #endif @@ -250,7 +250,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; 
else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 7ca3b7114..27de83ffc 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef SMP_SERVER -#ifdef OS_LINUX extern void openblas_set_num_threads(int num_threads) ; @@ -41,5 +40,13 @@ void NAME(int* num_threads){ openblas_set_num_threads(*num_threads); } -#endif +#else +//Single thread + +void openblas_set_num_threads(int num_threads) { +} + +void NAME(int* num_threads){ + +} #endif diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 21f56e889..d261e5a4e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,9 +163,9 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -384,6 +384,17 @@ void 
blas_set_parameter(void){ #endif #endif +#if defined(SANDYBRIDGE) + sgemm_p = 1024; + dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#endif + #if defined(CORE_PRESCOTT) || defined(GENERIC) size >>= 6; @@ -435,7 +446,7 @@ void blas_set_parameter(void){ #endif #endif -#if defined(CORE_BARCELONA) +#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) size >>= 8; sgemm_p = 232 * size; diff --git a/exports/Makefile b/exports/Makefile index 971bd0bed..40a3a7c63 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -70,11 +70,11 @@ dll2 : libgoto2_shared.dll $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ - --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) -lib /machine:i386 /def:libopenblas.def else $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ - --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) -lib /machine:X64 /def:libopenblas.def endif @@ -121,7 +121,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
rm -f linktest diff --git a/exports/gensymbol b/exports/gensymbol index 1f30d7b15..e09a8b6ab 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -72,7 +72,16 @@ zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, ); +@misc_no_underscore_objs = ( + openblas_set_num_threads, goto_set_num_threads, + ); + +@misc_underscore_objs = ( + openblas_set_num_threads, + ); + @lapackobjs = ( + # These routines are provided by OpenBLAS. sgesv, dgesv, cgesv, zgesv, sgetf2, dgetf2, cgetf2, zgetf2, sgetrf, dgetrf, cgetrf, zgetrf, @@ -88,24 +97,79 @@ ); @lapackobjs2 = ( - sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, + # These routines are provided by LAPACK (reference implementation). + # + # This list is prepared by copying all routines listed in + # `lapack-3.4.1/SRC/Makefile` and replacing the '.o' suffix with a comma. + # Thereafter the following routines should be removed: + # - those provided by OpenBLAS (see @lapackobjs) + # - extra precision routines (see @lapack_extendedprecision_objs) + # Each of these have been marked individually with "already provided" or "excluded". + + # ALLAUX -- Auxiliary routines called from all precisions + # already provided by @blasobjs: xerbla, lsame + ilaenv, ieeeck, lsamen, xerbla_array, iparmq, + ilaprec, ilatrans, ilauplo, iladiag, chla_transtype, + ilaver, slamch, + + # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. 
+ # excluded: second_$(TIMER) + sbdsdc, + sbdsqr, sdisna, slabad, slacpy, sladiv, slae2, slaebz, + slaed0, slaed1, slaed2, slaed3, slaed4, slaed5, slaed6, + slaed7, slaed8, slaed9, slaeda, slaev2, slagtf, + slagts, slamrg, slanst, + slapy2, slapy3, slarnv, + slarra, slarrb, slarrc, slarrd, slarre, slarrf, slarrj, + slarrk, slarrr, slaneg, + slartg, slaruv, slas2, slascl, + slasd0, slasd1, slasd2, slasd3, slasd4, slasd5, slasd6, + slasd7, slasd8, slasda, slasdq, slasdt, + slaset, slasq1, slasq2, slasq3, slasq4, slasq5, slasq6, + slasr, slasrt, slassq, slasv2, spttrf, sstebz, sstedc, + ssteqr, ssterf, slaisnan, sisnan, + slartgp, slartgs, + + # DZLAUX -- Auxiliary routines called from both DOUBLE and COMPLEX*16. + # excluded: dsecnd_$(TIMER) + dbdsdc, + dbdsqr, ddisna, dlabad, dlacpy, dladiv, dlae2, dlaebz, + dlaed0, dlaed1, dlaed2, dlaed3, dlaed4, dlaed5, dlaed6, + dlaed7, dlaed8, dlaed9, dlaeda, dlaev2, dlagtf, + dlagts, dlamrg, dlanst, + dlapy2, dlapy3, dlarnv, + dlarra, dlarrb, dlarrc, dlarrd, dlarre, dlarrf, dlarrj, + dlarrk, dlarrr, dlaneg, + dlartg, dlaruv, dlas2, dlascl, + dlasd0, dlasd1, dlasd2, dlasd3, dlasd4, dlasd5, dlasd6, + dlasd7, dlasd8, dlasda, dlasdq, dlasdt, + dlaset, dlasq1, dlasq2, dlasq3, dlasq4, dlasq5, dlasq6, + dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, + dsteqr, dsterf, dlaisnan, disnan, + dlartgp, dlartgs, + dlamch, + + # SLASRC -- Single precision real LAPACK routines + # already provided by @lapackobjs: + # sgesv, sgetf2, slaswp, slauu2, slauum, spotf2, spotri, strti2, strtri + sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, - sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, sgerq2, sgerqf, - sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, - sgetri, + sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, + sgerq2, sgerqf, 
sgesc2, sgesdd, sgesvd, sgesvx, + sgetc2, sgetri, sggbak, sggbal, sgges, sggesx, sggev, sggevx, sggglm, sgghrd, sgglse, sggqrf, - sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, + sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, shsein, shseqr, slabrd, slacon, slacn2, slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, slangb, slange, slangt, slanhs, slansb, slansp, slansy, slantb, slantp, slantr, slanv2, - slapll, slapmt, slapy2, slapy3, + slapll, slapmt, slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, slaqtr, slar1v, slar2v, ilaslr, ilaslc, @@ -119,19 +183,21 @@ sormbr, sormhr, sorml2, sormlq, sormql, sormqr, sormr2, sormr3, sormrq, sormrz, sormtr, spbcon, spbequ, spbrfs, spbstf, spbsv, spbsvx, - spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, - sposvx, spotrs, spstrf, spstf2, + spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, + sposvx, spstrf, spstf2, sppcon, sppequ, spprfs, sppsv, sppsvx, spptrf, spptri, spptrs, sptcon, - spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, + spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, ssbev, ssbevd, ssbevx, ssbgst, ssbgv, ssbgvd, ssbgvx, ssbtrd, sspcon, sspev, sspevd, sspevx, sspgst, sspgv, sspgvd, sspgvx, ssprfs, sspsv, sspsvx, ssptrd, ssptrf, ssptri, ssptrs, sstegr, sstein, sstev, sstevd, sstevr, - sstevx, ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, + sstevx, + ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, ssygst, ssygv, ssygvd, ssygvx, ssyrfs, ssysv, ssysvx, ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytri2, ssytri2x, - ssyswapr, ssytrs, ssytrs2, ssyconv, stbcon, + ssyswapr, ssytrs, ssytrs2, ssyconv, + stbcon, stbrfs, stbtrs, stgevc, stgex2, stgexc, stgsen, stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, stptrs, @@ -144,26 +210,38 @@ sbbcsd, slapmr, sorbdb, sorcsd, sgeqrt, sgeqrt2, sgeqrt3, sgemqrt, stpqrt, stpqrt2, stpmqrt, stprfb, - + + # 
DSLASRC -- Double-single mixed precision real routines called from + # single, single-extra and double precision real LAPACK + # routines (i.e. from SLASRC, SXLASRC, DLASRC). + # + # already provided by @lapackobjs: + # sgetrs, spotrf, sgetrf + spotrs, + + # CLASRC -- Single precision complex LAPACK routines + # already provided by @blasobjs: csymv + # already provided by @lapackobjs: + # cgesv, cgetf2, claswp, clauu2, clauum, cpotf2, cpotri, ctrti2, ctrtri cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, - cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, cgerq2, cgerqf, - cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, - cgetri, + cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, + cgerq2, cgerqf, cgesc2, cgesdd, cgesvd, + cgesvx, cgetc2, cgetri, cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, cgghrd, cgglse, cggqrf, cggrqf, cggsvd, cggsvp, - cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, + cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, chegv, chegvd, chegvx, cherfs, chesv, chesvx, chetd2, chetf2, chetrd, - chetrf, chetri, chetri2, chetri2x, cheswapr, + chetrf, chetri, chetri2, chetri2x, cheswapr, chetrs, chetrs2, chgeqz, chpcon, chpev, chpevd, - chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, + chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, chpsvx, chptrd, chptrf, chptri, chptrs, chsein, chseqr, clabrd, clacgv, clacon, clacn2, clacp2, clacpy, clacrm, clacrt, cladiv, @@ -177,21 +255,22 @@ claqhb, claqhe, claqhp, claqp2, claqps, claqsb, claqr0, claqr1, claqr2, claqr3, claqr4, claqr5, claqsp, claqsy, clar1v, clar2v, ilaclr, ilaclc, - clarf, clarfb, clarfg, clarfgp, clarft, + clarf, clarfb, clarfg, 
clarft, clarfgp, clarfx, clargv, clarnv, clarrv, clartg, clartv, clarz, clarzb, clarzt, clascl, claset, clasr, classq, clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, - clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, + clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, - cposv, cposvx, cpotrs, cpstrf, cpstf2, + cposv, cposvx, cpstrf, cpstf2, cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, cptcon, cpteqr, cptrfs, cptsv, cptsvx, cpttrf, cpttrs, cptts2, - crot, cspcon, cspmv, cspr, csprfs, cspsv, + crot, cspcon, cspmv, cspr, csprfs, cspsv, cspsvx, csptrf, csptri, csptrs, csrscl, cstedc, - cstegr, cstein, csteqr, csycon, - csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, - csytri2, csytri2x, csyswapr, - csytrs, csytrs2, csyconv, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, + cstegr, cstein, csteqr, + csycon, + csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, csytri2, csytri2x, + csyswapr, csytrs, csytrs2, csyconv, + ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, ctprfs, ctptri, ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, @@ -207,29 +286,41 @@ cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, ctpqrt, ctpqrt2, ctpmqrt, ctprfb, + # ZCLASRC -- Double-single mixed precision complex routines called from + # single, single-extra and double precision complex LAPACK + # routines (i.e. from CLASRC, CXLASRC, ZLASRC). 
+ # + # already provided by @lapackobjs: + # cgetrs, cpotrf, cgetrf + cpotrs, + + # DLASRC -- Double precision real LAPACK routines + # already provided by @lapackobjs: + # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, + # dtrti2, dtrtri dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, - dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, dgerq2, dgerqf, - dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, - dgetri, + dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, + dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx, + dgetc2, dgetri, dggbak, dggbal, dgges, dggesx, dggev, dggevx, dggglm, dgghrd, dgglse, dggqrf, - dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, + dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, dhsein, dhseqr, dlabrd, dlacon, dlacn2, dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, dlansy, dlantb, dlantp, dlantr, dlanv2, - dlapll, dlapmt, dlapy2, dlapy3, + dlapll, dlapmt, dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, dlaqtr, dlar1v, dlar2v, iladlr, iladlc, - dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, dlargv, - dlarrv, dlartv, + dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, + dlargv, dlarrv, dlartv, dlarz, dlarzb, dlarzt, dlasy2, dlasyf, dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, dopgtr, dopmtr, dorg2l, dorg2r, @@ -238,21 +329,22 @@ dormbr, dormhr, dorml2, dormlq, dormql, dormqr, dormr2, dormr3, dormrq, dormrz, dormtr, dpbcon, dpbequ, dpbrfs, dpbstf, dpbsv, dpbsvx, - dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, + dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, dposvx, dpotrs, dpstrf, 
dpstf2, dppcon, dppequ, dpprfs, dppsv, dppsvx, dpptrf, dpptri, dpptrs, dptcon, - dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, + dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, dsbev, dsbevd, dsbevx, dsbgst, dsbgv, dsbgvd, dsbgvx, dsbtrd, dspcon, dspev, dspevd, dspevx, dspgst, dspgv, dspgvd, dspgvx, dsprfs, dspsv, dspsvx, dsptrd, dsptrf, dsptri, dsptrs, dstegr, dstein, dstev, dstevd, dstevr, - dstevx, dsycon, dsyev, dsyevd, dsyevr, + dstevx, + dsycon, dsyev, dsyevd, dsyevr, dsyevx, dsygs2, dsygst, dsygv, dsygvd, dsygvx, dsyrfs, dsysv, dsysvx, - dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytrs, dsytrs2, - dsytri2, dsytri2x, dsyswapr, dsyconv, dtbcon, - dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, + dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytri2, dsytri2x, + dsyswapr, dsytrs, dsytrs2, dsyconv, + dtbcon, dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, dtptrs, dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, @@ -266,6 +358,11 @@ dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, dtpqrt, dtpqrt2, dtpmqrt, dtprfb, + # ZLASRC -- Double precision complex LAPACK routines + # already provided by @blasobjs: zsymv + # already provided by @lapackobjs: + # zgesv, zgetrs, zgetf2, zlaswp, zlauu2, zlauum, zpotf2, zpotrf, zpotri, + # ztrti2, ztrtri zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, @@ -277,14 +374,14 @@ zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, zgghrd, zgglse, zggqrf, zggrqf, zggsvd, zggsvp, - zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, + zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, zhegv, zhegvd, zhegvx, zherfs, zhesv, zhesvx, zhetd2, zhetf2, zhetrd, zhetrf, zhetri, zhetri2, zhetri2x, zheswapr, zhetrs, zhetrs2, zhgeqz, zhpcon, zhpev, zhpevd, - 
zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, + zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, zhpsvx, zhptrd, zhptrf, zhptri, zhptrs, zhsein, zhseqr, zlabrd, zlacgv, zlacon, zlacn2, zlacp2, zlacpy, zlacrm, zlacrt, zladiv, @@ -300,22 +397,23 @@ zlaqr0, zlaqr1, zlaqr2, zlaqr3, zlaqr4, zlaqr5, zlaqsp, zlaqsy, zlar1v, zlar2v, ilazlr, ilazlc, zlarcm, zlarf, zlarfb, - zlarfg, zlarfgp, zlarft, + zlarfg, zlarft, zlarfgp, zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, - zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, + zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, zlassq, zlasyf, zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, - zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, + zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, zposv, zposvx, zpotrs, zpstrf, zpstf2, zppcon, zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, zptcon, zpteqr, zptrfs, zptsv, zptsvx, zpttrf, zpttrs, zptts2, - zrot, zspcon, zspmv, zspr, zsprfs, zspsv, + zrot, zspcon, zspmv, zspr, zsprfs, zspsv, zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc, - zstegr, zstein, zsteqr, zsycon, - zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, - zsytri2, zsytri2x, zsyswapr, - zsytrs, zsytrs2, zsyconv, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, + zstegr, zstein, zsteqr, + zsycon, + zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, zsytri2, zsytri2x, + zsyswapr, zsytrs, zsytrs2, zsyconv, + ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, ztprfs, ztptri, ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, @@ -332,7 +430,6 @@ zbbcsd, zlapmr, zunbdb, zuncsd, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, - ); @lapack_extendedprecision_objs = ( @@ -341,15 +438,170 @@ ); @lapackeobjs = ( - lapack_make_complex_double, - lapack_make_complex_float, + # LAPACK C interface routines. 
+ # + # This list is prepared in a similar manner to @lapackobjs2, however the + # functions all begin with an uppercase prefix (with the exception of the + # make_complex_* routines). + # + # The functions corresponding to @(MATGEN_OBJ) and @(SRCX_OBJ) are not + # exported since the respective LAPACK routines are not built by default. + + # @(OBJ) from `lapack-3.4.1/lapacke/utils/Makefile` + LAPACKE_cgb_nancheck, + LAPACKE_cgb_trans, + LAPACKE_cge_nancheck, + LAPACKE_cge_trans, + LAPACKE_cgg_nancheck, + LAPACKE_cgg_trans, + LAPACKE_cgt_nancheck, + LAPACKE_chb_nancheck, + LAPACKE_chb_trans, + LAPACKE_che_nancheck, + LAPACKE_che_trans, + LAPACKE_chp_nancheck, + LAPACKE_chp_trans, + LAPACKE_chs_nancheck, + LAPACKE_chs_trans, LAPACKE_c_nancheck, + LAPACKE_cpb_nancheck, + LAPACKE_cpb_trans, + LAPACKE_cpf_nancheck, + LAPACKE_cpf_trans, + LAPACKE_cpo_nancheck, + LAPACKE_cpo_trans, + LAPACKE_cpp_nancheck, + LAPACKE_cpp_trans, + LAPACKE_cpt_nancheck, + LAPACKE_csp_nancheck, + LAPACKE_csp_trans, + LAPACKE_cst_nancheck, + LAPACKE_csy_nancheck, + LAPACKE_csy_trans, + LAPACKE_ctb_nancheck, + LAPACKE_ctb_trans, + LAPACKE_ctf_nancheck, + LAPACKE_ctf_trans, + LAPACKE_ctp_nancheck, + LAPACKE_ctp_trans, + LAPACKE_ctr_nancheck, + LAPACKE_ctr_trans, + LAPACKE_dgb_nancheck, + LAPACKE_dgb_trans, + LAPACKE_dge_nancheck, + LAPACKE_dge_trans, + LAPACKE_dgg_nancheck, + LAPACKE_dgg_trans, + LAPACKE_dgt_nancheck, + LAPACKE_dhs_nancheck, + LAPACKE_dhs_trans, + LAPACKE_d_nancheck, + LAPACKE_dpb_nancheck, + LAPACKE_dpb_trans, + LAPACKE_dpf_nancheck, + LAPACKE_dpf_trans, + LAPACKE_dpo_nancheck, + LAPACKE_dpo_trans, + LAPACKE_dpp_nancheck, + LAPACKE_dpp_trans, + LAPACKE_dpt_nancheck, + LAPACKE_dsb_nancheck, + LAPACKE_dsb_trans, + LAPACKE_dsp_nancheck, + LAPACKE_dsp_trans, + LAPACKE_dst_nancheck, + LAPACKE_dsy_nancheck, + LAPACKE_dsy_trans, + LAPACKE_dtb_nancheck, + LAPACKE_dtb_trans, + LAPACKE_dtf_nancheck, + LAPACKE_dtf_trans, + LAPACKE_dtp_nancheck, + LAPACKE_dtp_trans, + LAPACKE_dtr_nancheck, 
+ LAPACKE_dtr_trans, + LAPACKE_lsame, + LAPACKE_sgb_nancheck, + LAPACKE_sgb_trans, + LAPACKE_sge_nancheck, + LAPACKE_sge_trans, + LAPACKE_sgg_nancheck, + LAPACKE_sgg_trans, + LAPACKE_sgt_nancheck, + LAPACKE_shs_nancheck, + LAPACKE_shs_trans, + LAPACKE_s_nancheck, + LAPACKE_spb_nancheck, + LAPACKE_spb_trans, + LAPACKE_spf_nancheck, + LAPACKE_spf_trans, + LAPACKE_spo_nancheck, + LAPACKE_spo_trans, + LAPACKE_spp_nancheck, + LAPACKE_spp_trans, + LAPACKE_spt_nancheck, + LAPACKE_ssb_nancheck, + LAPACKE_ssb_trans, + LAPACKE_ssp_nancheck, + LAPACKE_ssp_trans, + LAPACKE_sst_nancheck, + LAPACKE_ssy_nancheck, + LAPACKE_ssy_trans, + LAPACKE_stb_nancheck, + LAPACKE_stb_trans, + LAPACKE_stf_nancheck, + LAPACKE_stf_trans, + LAPACKE_stp_nancheck, + LAPACKE_stp_trans, + LAPACKE_str_nancheck, + LAPACKE_str_trans, + LAPACKE_xerbla, + LAPACKE_zgb_nancheck, + LAPACKE_zgb_trans, + LAPACKE_zge_nancheck, + LAPACKE_zge_trans, + LAPACKE_zgg_nancheck, + LAPACKE_zgg_trans, + LAPACKE_zgt_nancheck, + LAPACKE_zhb_nancheck, + LAPACKE_zhb_trans, + LAPACKE_zhe_nancheck, + LAPACKE_zhe_trans, + LAPACKE_zhp_nancheck, + LAPACKE_zhp_trans, + LAPACKE_zhs_nancheck, + LAPACKE_zhs_trans, + LAPACKE_z_nancheck, + LAPACKE_zpb_nancheck, + LAPACKE_zpb_trans, + LAPACKE_zpf_nancheck, + LAPACKE_zpf_trans, + LAPACKE_zpo_nancheck, + LAPACKE_zpo_trans, + LAPACKE_zpp_nancheck, + LAPACKE_zpp_trans, + LAPACKE_zpt_nancheck, + LAPACKE_zsp_nancheck, + LAPACKE_zsp_trans, + LAPACKE_zst_nancheck, + LAPACKE_zsy_nancheck, + LAPACKE_zsy_trans, + LAPACKE_ztb_nancheck, + LAPACKE_ztb_trans, + LAPACKE_ztf_nancheck, + LAPACKE_ztf_trans, + LAPACKE_ztp_nancheck, + LAPACKE_ztp_trans, + LAPACKE_ztr_nancheck, + LAPACKE_ztr_trans, + lapack_make_complex_float, + lapack_make_complex_double, + + # @(SRC_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` LAPACKE_cbbcsd, LAPACKE_cbbcsd_work, LAPACKE_cbdsqr, LAPACKE_cbdsqr_work, - LAPACKE_cgb_nancheck, - LAPACKE_cgb_trans, LAPACKE_cgbbrd, LAPACKE_cgbbrd_work, LAPACKE_cgbcon, @@ -368,8 +620,6 @@ 
LAPACKE_cgbtrf_work, LAPACKE_cgbtrs, LAPACKE_cgbtrs_work, - LAPACKE_cge_nancheck, - LAPACKE_cge_trans, LAPACKE_cgebak, LAPACKE_cgebak_work, LAPACKE_cgebal, @@ -444,8 +694,6 @@ LAPACKE_cgetri_work, LAPACKE_cgetrs, LAPACKE_cgetrs_work, - LAPACKE_cgg_nancheck, - LAPACKE_cgg_trans, LAPACKE_cggbak, LAPACKE_cggbak_work, LAPACKE_cggbal, @@ -472,7 +720,6 @@ LAPACKE_cggsvd_work, LAPACKE_cggsvp, LAPACKE_cggsvp_work, - LAPACKE_cgt_nancheck, LAPACKE_cgtcon, LAPACKE_cgtcon_work, LAPACKE_cgtrfs, @@ -485,8 +732,6 @@ LAPACKE_cgttrf_work, LAPACKE_cgttrs, LAPACKE_cgttrs_work, - LAPACKE_chb_nancheck, - LAPACKE_chb_trans, LAPACKE_chbev, LAPACKE_chbev_work, LAPACKE_chbevd, @@ -503,8 +748,6 @@ LAPACKE_chbgvx_work, LAPACKE_chbtrd, LAPACKE_chbtrd_work, - LAPACKE_che_nancheck, - LAPACKE_che_trans, LAPACKE_checon, LAPACKE_checon_work, LAPACKE_cheequb, @@ -551,8 +794,6 @@ LAPACKE_chfrk_work, LAPACKE_chgeqz, LAPACKE_chgeqz_work, - LAPACKE_chp_nancheck, - LAPACKE_chp_trans, LAPACKE_chpcon, LAPACKE_chpcon_work, LAPACKE_chpev, @@ -583,8 +824,6 @@ LAPACKE_chptri_work, LAPACKE_chptrs, LAPACKE_chptrs_work, - LAPACKE_chs_nancheck, - LAPACKE_chs_trans, LAPACKE_chsein, LAPACKE_chsein_work, LAPACKE_chseqr, @@ -621,8 +860,6 @@ LAPACKE_claswp_work, LAPACKE_clauum, LAPACKE_clauum_work, - LAPACKE_cpb_nancheck, - LAPACKE_cpb_trans, LAPACKE_cpbcon, LAPACKE_cpbcon_work, LAPACKE_cpbequ, @@ -639,16 +876,12 @@ LAPACKE_cpbtrf_work, LAPACKE_cpbtrs, LAPACKE_cpbtrs_work, - LAPACKE_cpf_nancheck, - LAPACKE_cpf_trans, LAPACKE_cpftrf, LAPACKE_cpftrf_work, LAPACKE_cpftri, LAPACKE_cpftri_work, LAPACKE_cpftrs, LAPACKE_cpftrs_work, - LAPACKE_cpo_nancheck, - LAPACKE_cpo_trans, LAPACKE_cpocon, LAPACKE_cpocon_work, LAPACKE_cpoequ, @@ -667,8 +900,6 @@ LAPACKE_cpotri_work, LAPACKE_cpotrs, LAPACKE_cpotrs_work, - LAPACKE_cpp_nancheck, - LAPACKE_cpp_trans, LAPACKE_cppcon, LAPACKE_cppcon_work, LAPACKE_cppequ, @@ -687,7 +918,6 @@ LAPACKE_cpptrs_work, LAPACKE_cpstrf, LAPACKE_cpstrf_work, - LAPACKE_cpt_nancheck, LAPACKE_cptcon, 
LAPACKE_cptcon_work, LAPACKE_cpteqr, @@ -702,8 +932,6 @@ LAPACKE_cpttrf_work, LAPACKE_cpttrs, LAPACKE_cpttrs_work, - LAPACKE_csp_nancheck, - LAPACKE_csp_trans, LAPACKE_cspcon, LAPACKE_cspcon_work, LAPACKE_csprfs, @@ -718,7 +946,6 @@ LAPACKE_csptri_work, LAPACKE_csptrs, LAPACKE_csptrs_work, - LAPACKE_cst_nancheck, LAPACKE_cstedc, LAPACKE_cstedc_work, LAPACKE_cstegr, @@ -729,16 +956,12 @@ LAPACKE_cstemr_work, LAPACKE_csteqr, LAPACKE_csteqr_work, - LAPACKE_csy_nancheck, - LAPACKE_csy_trans, LAPACKE_csycon, LAPACKE_csycon_work, LAPACKE_csyconv, LAPACKE_csyconv_work, LAPACKE_csyequb, LAPACKE_csyequb_work, - LAPACKE_csyr, - LAPACKE_csyr_work, LAPACKE_csyrfs, LAPACKE_csyrfs_work, LAPACKE_csysv, @@ -759,16 +982,12 @@ LAPACKE_csytrs2, LAPACKE_csytrs2_work, LAPACKE_csytrs_work, - LAPACKE_ctb_nancheck, - LAPACKE_ctb_trans, LAPACKE_ctbcon, LAPACKE_ctbcon_work, LAPACKE_ctbrfs, LAPACKE_ctbrfs_work, LAPACKE_ctbtrs, LAPACKE_ctbtrs_work, - LAPACKE_ctf_nancheck, - LAPACKE_ctf_trans, LAPACKE_ctfsm, LAPACKE_ctfsm_work, LAPACKE_ctftri, @@ -789,8 +1008,6 @@ LAPACKE_ctgsna_work, LAPACKE_ctgsyl, LAPACKE_ctgsyl_work, - LAPACKE_ctp_nancheck, - LAPACKE_ctp_trans, LAPACKE_ctpcon, LAPACKE_ctpcon_work, LAPACKE_ctpmqrt, @@ -811,8 +1028,6 @@ LAPACKE_ctpttf_work, LAPACKE_ctpttr, LAPACKE_ctpttr_work, - LAPACKE_ctr_nancheck, - LAPACKE_ctr_trans, LAPACKE_ctrcon, LAPACKE_ctrcon_work, LAPACKE_ctrevc, @@ -875,7 +1090,6 @@ LAPACKE_cupgtr_work, LAPACKE_cupmtr, LAPACKE_cupmtr_work, - LAPACKE_d_nancheck, LAPACKE_dbbcsd, LAPACKE_dbbcsd_work, LAPACKE_dbdsdc, @@ -884,8 +1098,6 @@ LAPACKE_dbdsqr_work, LAPACKE_ddisna, LAPACKE_ddisna_work, - LAPACKE_dgb_nancheck, - LAPACKE_dgb_trans, LAPACKE_dgbbrd, LAPACKE_dgbbrd_work, LAPACKE_dgbcon, @@ -904,8 +1116,6 @@ LAPACKE_dgbtrf_work, LAPACKE_dgbtrs, LAPACKE_dgbtrs_work, - LAPACKE_dge_nancheck, - LAPACKE_dge_trans, LAPACKE_dgebak, LAPACKE_dgebak_work, LAPACKE_dgebal, @@ -984,8 +1194,6 @@ LAPACKE_dgetri_work, LAPACKE_dgetrs, LAPACKE_dgetrs_work, - LAPACKE_dgg_nancheck, - 
LAPACKE_dgg_trans, LAPACKE_dggbak, LAPACKE_dggbak_work, LAPACKE_dggbal, @@ -1012,7 +1220,6 @@ LAPACKE_dggsvd_work, LAPACKE_dggsvp, LAPACKE_dggsvp_work, - LAPACKE_dgt_nancheck, LAPACKE_dgtcon, LAPACKE_dgtcon_work, LAPACKE_dgtrfs, @@ -1027,8 +1234,6 @@ LAPACKE_dgttrs_work, LAPACKE_dhgeqz, LAPACKE_dhgeqz_work, - LAPACKE_dhs_nancheck, - LAPACKE_dhs_trans, LAPACKE_dhsein, LAPACKE_dhsein_work, LAPACKE_dhseqr, @@ -1111,8 +1316,6 @@ LAPACKE_dormrz_work, LAPACKE_dormtr, LAPACKE_dormtr_work, - LAPACKE_dpb_nancheck, - LAPACKE_dpb_trans, LAPACKE_dpbcon, LAPACKE_dpbcon_work, LAPACKE_dpbequ, @@ -1129,16 +1332,12 @@ LAPACKE_dpbtrf_work, LAPACKE_dpbtrs, LAPACKE_dpbtrs_work, - LAPACKE_dpf_nancheck, - LAPACKE_dpf_trans, LAPACKE_dpftrf, LAPACKE_dpftrf_work, LAPACKE_dpftri, LAPACKE_dpftri_work, LAPACKE_dpftrs, LAPACKE_dpftrs_work, - LAPACKE_dpo_nancheck, - LAPACKE_dpo_trans, LAPACKE_dpocon, LAPACKE_dpocon_work, LAPACKE_dpoequ, @@ -1157,8 +1356,6 @@ LAPACKE_dpotri_work, LAPACKE_dpotrs, LAPACKE_dpotrs_work, - LAPACKE_dpp_nancheck, - LAPACKE_dpp_trans, LAPACKE_dppcon, LAPACKE_dppcon_work, LAPACKE_dppequ, @@ -1177,7 +1374,6 @@ LAPACKE_dpptrs_work, LAPACKE_dpstrf, LAPACKE_dpstrf_work, - LAPACKE_dpt_nancheck, LAPACKE_dptcon, LAPACKE_dptcon_work, LAPACKE_dpteqr, @@ -1192,8 +1388,6 @@ LAPACKE_dpttrf_work, LAPACKE_dpttrs, LAPACKE_dpttrs_work, - LAPACKE_dsb_nancheck, - LAPACKE_dsb_trans, LAPACKE_dsbev, LAPACKE_dsbev_work, LAPACKE_dsbevd, @@ -1214,8 +1408,6 @@ LAPACKE_dsfrk_work, LAPACKE_dsgesv, LAPACKE_dsgesv_work, - LAPACKE_dsp_nancheck, - LAPACKE_dsp_trans, LAPACKE_dspcon, LAPACKE_dspcon_work, LAPACKE_dspev, @@ -1248,7 +1440,6 @@ LAPACKE_dsptri_work, LAPACKE_dsptrs, LAPACKE_dsptrs_work, - LAPACKE_dst_nancheck, LAPACKE_dstebz, LAPACKE_dstebz_work, LAPACKE_dstedc, @@ -1271,8 +1462,6 @@ LAPACKE_dstevr_work, LAPACKE_dstevx, LAPACKE_dstevx_work, - LAPACKE_dsy_nancheck, - LAPACKE_dsy_trans, LAPACKE_dsycon, LAPACKE_dsycon_work, LAPACKE_dsyconv, @@ -1317,16 +1506,12 @@ LAPACKE_dsytrs2, 
LAPACKE_dsytrs2_work, LAPACKE_dsytrs_work, - LAPACKE_dtb_nancheck, - LAPACKE_dtb_trans, LAPACKE_dtbcon, LAPACKE_dtbcon_work, LAPACKE_dtbrfs, LAPACKE_dtbrfs_work, LAPACKE_dtbtrs, LAPACKE_dtbtrs_work, - LAPACKE_dtf_nancheck, - LAPACKE_dtf_trans, LAPACKE_dtfsm, LAPACKE_dtfsm_work, LAPACKE_dtftri, @@ -1347,8 +1532,6 @@ LAPACKE_dtgsna_work, LAPACKE_dtgsyl, LAPACKE_dtgsyl_work, - LAPACKE_dtp_nancheck, - LAPACKE_dtp_trans, LAPACKE_dtpcon, LAPACKE_dtpcon_work, LAPACKE_dtpmqrt, @@ -1369,8 +1552,6 @@ LAPACKE_dtpttf_work, LAPACKE_dtpttr, LAPACKE_dtpttr_work, - LAPACKE_dtr_nancheck, - LAPACKE_dtr_trans, LAPACKE_dtrcon, LAPACKE_dtrcon_work, LAPACKE_dtrevc, @@ -1395,8 +1576,6 @@ LAPACKE_dtrttp_work, LAPACKE_dtzrzf, LAPACKE_dtzrzf_work, - LAPACKE_lsame, - LAPACKE_s_nancheck, LAPACKE_sbbcsd, LAPACKE_sbbcsd_work, LAPACKE_sbdsdc, @@ -1405,8 +1584,6 @@ LAPACKE_sbdsqr_work, LAPACKE_sdisna, LAPACKE_sdisna_work, - LAPACKE_sgb_nancheck, - LAPACKE_sgb_trans, LAPACKE_sgbbrd, LAPACKE_sgbbrd_work, LAPACKE_sgbcon, @@ -1425,8 +1602,6 @@ LAPACKE_sgbtrf_work, LAPACKE_sgbtrs, LAPACKE_sgbtrs_work, - LAPACKE_sge_nancheck, - LAPACKE_sge_trans, LAPACKE_sgebak, LAPACKE_sgebak_work, LAPACKE_sgebal, @@ -1505,8 +1680,6 @@ LAPACKE_sgetri_work, LAPACKE_sgetrs, LAPACKE_sgetrs_work, - LAPACKE_sgg_nancheck, - LAPACKE_sgg_trans, LAPACKE_sggbak, LAPACKE_sggbak_work, LAPACKE_sggbal, @@ -1533,7 +1706,6 @@ LAPACKE_sggsvd_work, LAPACKE_sggsvp, LAPACKE_sggsvp_work, - LAPACKE_sgt_nancheck, LAPACKE_sgtcon, LAPACKE_sgtcon_work, LAPACKE_sgtrfs, @@ -1548,8 +1720,6 @@ LAPACKE_sgttrs_work, LAPACKE_shgeqz, LAPACKE_shgeqz_work, - LAPACKE_shs_nancheck, - LAPACKE_shs_trans, LAPACKE_shsein, LAPACKE_shsein_work, LAPACKE_shseqr, @@ -1632,8 +1802,6 @@ LAPACKE_sormrz_work, LAPACKE_sormtr, LAPACKE_sormtr_work, - LAPACKE_spb_nancheck, - LAPACKE_spb_trans, LAPACKE_spbcon, LAPACKE_spbcon_work, LAPACKE_spbequ, @@ -1650,16 +1818,12 @@ LAPACKE_spbtrf_work, LAPACKE_spbtrs, LAPACKE_spbtrs_work, - LAPACKE_spf_nancheck, - LAPACKE_spf_trans, 
LAPACKE_spftrf, LAPACKE_spftrf_work, LAPACKE_spftri, LAPACKE_spftri_work, LAPACKE_spftrs, LAPACKE_spftrs_work, - LAPACKE_spo_nancheck, - LAPACKE_spo_trans, LAPACKE_spocon, LAPACKE_spocon_work, LAPACKE_spoequ, @@ -1678,8 +1842,6 @@ LAPACKE_spotri_work, LAPACKE_spotrs, LAPACKE_spotrs_work, - LAPACKE_spp_nancheck, - LAPACKE_spp_trans, LAPACKE_sppcon, LAPACKE_sppcon_work, LAPACKE_sppequ, @@ -1698,7 +1860,6 @@ LAPACKE_spptrs_work, LAPACKE_spstrf, LAPACKE_spstrf_work, - LAPACKE_spt_nancheck, LAPACKE_sptcon, LAPACKE_sptcon_work, LAPACKE_spteqr, @@ -1713,8 +1874,6 @@ LAPACKE_spttrf_work, LAPACKE_spttrs, LAPACKE_spttrs_work, - LAPACKE_ssb_nancheck, - LAPACKE_ssb_trans, LAPACKE_ssbev, LAPACKE_ssbev_work, LAPACKE_ssbevd, @@ -1733,8 +1892,6 @@ LAPACKE_ssbtrd_work, LAPACKE_ssfrk, LAPACKE_ssfrk_work, - LAPACKE_ssp_nancheck, - LAPACKE_ssp_trans, LAPACKE_sspcon, LAPACKE_sspcon_work, LAPACKE_sspev, @@ -1765,7 +1922,6 @@ LAPACKE_ssptri_work, LAPACKE_ssptrs, LAPACKE_ssptrs_work, - LAPACKE_sst_nancheck, LAPACKE_sstebz, LAPACKE_sstebz_work, LAPACKE_sstedc, @@ -1788,8 +1944,6 @@ LAPACKE_sstevr_work, LAPACKE_sstevx, LAPACKE_sstevx_work, - LAPACKE_ssy_nancheck, - LAPACKE_ssy_trans, LAPACKE_ssycon, LAPACKE_ssycon_work, LAPACKE_ssyconv, @@ -1834,16 +1988,12 @@ LAPACKE_ssytrs2, LAPACKE_ssytrs2_work, LAPACKE_ssytrs_work, - LAPACKE_stb_nancheck, - LAPACKE_stb_trans, LAPACKE_stbcon, LAPACKE_stbcon_work, LAPACKE_stbrfs, LAPACKE_stbrfs_work, LAPACKE_stbtrs, LAPACKE_stbtrs_work, - LAPACKE_stf_nancheck, - LAPACKE_stf_trans, LAPACKE_stfsm, LAPACKE_stfsm_work, LAPACKE_stftri, @@ -1864,8 +2014,6 @@ LAPACKE_stgsna_work, LAPACKE_stgsyl, LAPACKE_stgsyl_work, - LAPACKE_stp_nancheck, - LAPACKE_stp_trans, LAPACKE_stpcon, LAPACKE_stpcon_work, LAPACKE_stpmqrt, @@ -1884,8 +2032,6 @@ LAPACKE_stpttf_work, LAPACKE_stpttr, LAPACKE_stpttr_work, - LAPACKE_str_nancheck, - LAPACKE_str_trans, LAPACKE_strcon, LAPACKE_strcon_work, LAPACKE_strevc, @@ -1910,8 +2056,6 @@ LAPACKE_strttp_work, LAPACKE_stzrzf, 
LAPACKE_stzrzf_work, - LAPACKE_xerbla, - LAPACKE_z_nancheck, LAPACKE_zbbcsd, LAPACKE_zbbcsd_work, LAPACKE_zbdsqr, @@ -1920,8 +2064,6 @@ LAPACKE_zcgesv_work, LAPACKE_zcposv, LAPACKE_zcposv_work, - LAPACKE_zgb_nancheck, - LAPACKE_zgb_trans, LAPACKE_zgbbrd, LAPACKE_zgbbrd_work, LAPACKE_zgbcon, @@ -1940,8 +2082,6 @@ LAPACKE_zgbtrf_work, LAPACKE_zgbtrs, LAPACKE_zgbtrs_work, - LAPACKE_zge_nancheck, - LAPACKE_zge_trans, LAPACKE_zgebak, LAPACKE_zgebak_work, LAPACKE_zgebal, @@ -2016,8 +2156,6 @@ LAPACKE_zgetri_work, LAPACKE_zgetrs, LAPACKE_zgetrs_work, - LAPACKE_zgg_nancheck, - LAPACKE_zgg_trans, LAPACKE_zggbak, LAPACKE_zggbak_work, LAPACKE_zggbal, @@ -2044,7 +2182,6 @@ LAPACKE_zggsvd_work, LAPACKE_zggsvp, LAPACKE_zggsvp_work, - LAPACKE_zgt_nancheck, LAPACKE_zgtcon, LAPACKE_zgtcon_work, LAPACKE_zgtrfs, @@ -2057,8 +2194,6 @@ LAPACKE_zgttrf_work, LAPACKE_zgttrs, LAPACKE_zgttrs_work, - LAPACKE_zhb_nancheck, - LAPACKE_zhb_trans, LAPACKE_zhbev, LAPACKE_zhbev_work, LAPACKE_zhbevd, @@ -2075,8 +2210,6 @@ LAPACKE_zhbgvx_work, LAPACKE_zhbtrd, LAPACKE_zhbtrd_work, - LAPACKE_zhe_nancheck, - LAPACKE_zhe_trans, LAPACKE_zhecon, LAPACKE_zhecon_work, LAPACKE_zheequb, @@ -2123,8 +2256,6 @@ LAPACKE_zhfrk_work, LAPACKE_zhgeqz, LAPACKE_zhgeqz_work, - LAPACKE_zhp_nancheck, - LAPACKE_zhp_trans, LAPACKE_zhpcon, LAPACKE_zhpcon_work, LAPACKE_zhpev, @@ -2155,8 +2286,6 @@ LAPACKE_zhptri_work, LAPACKE_zhptrs, LAPACKE_zhptrs_work, - LAPACKE_zhs_nancheck, - LAPACKE_zhs_trans, LAPACKE_zhsein, LAPACKE_zhsein_work, LAPACKE_zhseqr, @@ -2193,8 +2322,6 @@ LAPACKE_zlaswp_work, LAPACKE_zlauum, LAPACKE_zlauum_work, - LAPACKE_zpb_nancheck, - LAPACKE_zpb_trans, LAPACKE_zpbcon, LAPACKE_zpbcon_work, LAPACKE_zpbequ, @@ -2211,16 +2338,12 @@ LAPACKE_zpbtrf_work, LAPACKE_zpbtrs, LAPACKE_zpbtrs_work, - LAPACKE_zpf_nancheck, - LAPACKE_zpf_trans, LAPACKE_zpftrf, LAPACKE_zpftrf_work, LAPACKE_zpftri, LAPACKE_zpftri_work, LAPACKE_zpftrs, LAPACKE_zpftrs_work, - LAPACKE_zpo_nancheck, - LAPACKE_zpo_trans, LAPACKE_zpocon, 
LAPACKE_zpocon_work, LAPACKE_zpoequ, @@ -2239,8 +2362,6 @@ LAPACKE_zpotri_work, LAPACKE_zpotrs, LAPACKE_zpotrs_work, - LAPACKE_zpp_nancheck, - LAPACKE_zpp_trans, LAPACKE_zppcon, LAPACKE_zppcon_work, LAPACKE_zppequ, @@ -2259,7 +2380,6 @@ LAPACKE_zpptrs_work, LAPACKE_zpstrf, LAPACKE_zpstrf_work, - LAPACKE_zpt_nancheck, LAPACKE_zptcon, LAPACKE_zptcon_work, LAPACKE_zpteqr, @@ -2274,8 +2394,6 @@ LAPACKE_zpttrf_work, LAPACKE_zpttrs, LAPACKE_zpttrs_work, - LAPACKE_zsp_nancheck, - LAPACKE_zsp_trans, LAPACKE_zspcon, LAPACKE_zspcon_work, LAPACKE_zsprfs, @@ -2290,7 +2408,6 @@ LAPACKE_zsptri_work, LAPACKE_zsptrs, LAPACKE_zsptrs_work, - LAPACKE_zst_nancheck, LAPACKE_zstedc, LAPACKE_zstedc_work, LAPACKE_zstegr, @@ -2301,16 +2418,12 @@ LAPACKE_zstemr_work, LAPACKE_zsteqr, LAPACKE_zsteqr_work, - LAPACKE_zsy_nancheck, - LAPACKE_zsy_trans, LAPACKE_zsycon, LAPACKE_zsycon_work, LAPACKE_zsyconv, LAPACKE_zsyconv_work, LAPACKE_zsyequb, LAPACKE_zsyequb_work, - LAPACKE_zsyr, - LAPACKE_zsyr_work, LAPACKE_zsyrfs, LAPACKE_zsyrfs_work, LAPACKE_zsysv, @@ -2331,16 +2444,12 @@ LAPACKE_zsytrs2, LAPACKE_zsytrs2_work, LAPACKE_zsytrs_work, - LAPACKE_ztb_nancheck, - LAPACKE_ztb_trans, LAPACKE_ztbcon, LAPACKE_ztbcon_work, LAPACKE_ztbrfs, LAPACKE_ztbrfs_work, LAPACKE_ztbtrs, LAPACKE_ztbtrs_work, - LAPACKE_ztf_nancheck, - LAPACKE_ztf_trans, LAPACKE_ztfsm, LAPACKE_ztfsm_work, LAPACKE_ztftri, @@ -2361,8 +2470,6 @@ LAPACKE_ztgsna_work, LAPACKE_ztgsyl, LAPACKE_ztgsyl_work, - LAPACKE_ztp_nancheck, - LAPACKE_ztp_trans, LAPACKE_ztpcon, LAPACKE_ztpcon_work, LAPACKE_ztpmqrt, @@ -2383,8 +2490,6 @@ LAPACKE_ztpttf_work, LAPACKE_ztpttr, LAPACKE_ztpttr_work, - LAPACKE_ztr_nancheck, - LAPACKE_ztr_trans, LAPACKE_ztrcon, LAPACKE_ztrcon_work, LAPACKE_ztrevc, @@ -2447,15 +2552,127 @@ LAPACKE_zupgtr_work, LAPACKE_zupmtr, LAPACKE_zupmtr_work, + LAPACKE_zsyr, + LAPACKE_csyr, + LAPACKE_zsyr_work, + LAPACKE_csyr_work, + + ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + ## Not exported: requires LAPACKE_EXTENDED to be 
set and depends on the + ## corresponding LAPACK extended precision routines. + #LAPACKE_cgbrfsx, + #LAPACKE_cporfsx, + #LAPACKE_dgerfsx, + #LAPACKE_sgbrfsx, + #LAPACKE_ssyrfsx, + #LAPACKE_zherfsx, + #LAPACKE_cgbrfsx_work, + #LAPACKE_cporfsx_work, + #LAPACKE_dgerfsx_work, + #LAPACKE_sgbrfsx_work, + #LAPACKE_ssyrfsx_work, + #LAPACKE_zherfsx_work, + #LAPACKE_cgerfsx, + #LAPACKE_csyrfsx, + #LAPACKE_dporfsx, + #LAPACKE_sgerfsx, + #LAPACKE_zgbrfsx, + #LAPACKE_zporfsx, + #LAPACKE_cgerfsx_work, + #LAPACKE_csyrfsx_work, + #LAPACKE_dporfsx_work, + #LAPACKE_sgerfsx_work, + #LAPACKE_zgbrfsx_work, + #LAPACKE_zporfsx_work, + #LAPACKE_cherfsx, + #LAPACKE_dgbrfsx, + #LAPACKE_dsyrfsx, + #LAPACKE_sporfsx, + #LAPACKE_zgerfsx, + #LAPACKE_zsyrfsx, + #LAPACKE_cherfsx_work, + #LAPACKE_dgbrfsx_work, + #LAPACKE_dsyrfsx_work, + #LAPACKE_sporfsx_work, + #LAPACKE_zgerfsx_work, + #LAPACKE_zsyrfsx_work, + #LAPACKE_cgbsvxx, + #LAPACKE_cposvxx, + #LAPACKE_dgesvxx, + #LAPACKE_sgbsvxx, + #LAPACKE_ssysvxx, + #LAPACKE_zhesvxx, + #LAPACKE_cgbsvxx_work, + #LAPACKE_cposvxx_work, + #LAPACKE_dgesvxx_work, + #LAPACKE_sgbsvxx_work, + #LAPACKE_ssysvxx_work, + #LAPACKE_zhesvxx_work, + #LAPACKE_cgesvxx, + #LAPACKE_csysvxx, + #LAPACKE_dposvxx, + #LAPACKE_sgesvxx, + #LAPACKE_zgbsvxx, + #LAPACKE_zposvxx, + #LAPACKE_cgesvxx_work, + #LAPACKE_csysvxx_work, + #LAPACKE_dposvxx_work, + #LAPACKE_sgesvxx_work, + #LAPACKE_zgbsvxx_work, + #LAPACKE_zposvxx_work, + #LAPACKE_chesvxx, + #LAPACKE_dgbsvxx, + #LAPACKE_dsysvxx, + #LAPACKE_sposvxx, + #LAPACKE_zgesvxx, + #LAPACKE_zsysvxx, + #LAPACKE_chesvxx_work, + #LAPACKE_dgbsvxx_work, + #LAPACKE_dsysvxx_work, + #LAPACKE_sposvxx_work, + #LAPACKE_zgesvxx_work, + #LAPACKE_zsysvxx_work, + + ## @(MATGEN_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + ## Not exported: requires LAPACKE_TESTING to be set and depends on libtmg + ## (see `lapack-3.4.1/TESTING/MATGEN`). 
+ #LAPACKE_clatms, + #LAPACKE_clatms_work, + #LAPACKE_dlatms, + #LAPACKE_dlatms_work, + #LAPACKE_slatms, + #LAPACKE_slatms_work, + #LAPACKE_zlatms, + #LAPACKE_zlatms_work, + #LAPACKE_clagge, + #LAPACKE_clagge_work, + #LAPACKE_dlagge, + #LAPACKE_dlagge_work, + #LAPACKE_slagge, + #LAPACKE_slagge_work, + #LAPACKE_zlagge, + #LAPACKE_zlagge_work, + #LAPACKE_claghe, + #LAPACKE_claghe_work, + #LAPACKE_zlaghe, + #LAPACKE_zlaghe_work, + #LAPACKE_clagsy, + #LAPACKE_clagsy_work, + #LAPACKE_dlagsy, + #LAPACKE_dlagsy_work, + #LAPACKE_slagsy, + #LAPACKE_slagsy_work, + #LAPACKE_zlagsy, + #LAPACKE_zlagsy_work, ); if ($ARGV[5] == 1) { #NO_LAPACK=1 - @underscore_objs = (@blasobjs); + @underscore_objs = (@blasobjs, @misc_underscore_objs); } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1") { - @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2); + @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); } else { - @underscore_objs = (@blasobjs, @lapackobjs); + @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); } if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; @@ -2469,10 +2686,10 @@ if ($ARGV[1] eq "ia64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[4] == 0) { - @no_underscore_objs = (@cblasobjs); + @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); }else{ #NO_CBLAS=1 - @no_underscore_objs = (); + @no_underscore_objs = (@misc_no_underscore_objs); } if ($ARGV[6] == 1) { #NO_LAPACKE=1 @@ -2542,6 +2759,10 @@ if ($ARGV[0] eq "aix"){ if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; $count = 1; + + #remove openblas_set_num_threads by exact name; a [^...] character class would also drop symbols such as ddot, sasum or dpotrs + @underscore_objs = grep { $_ ne "openblas_set_num_threads" } @underscore_objs; + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; @@ -2552,7 +2773,11 @@ if ($ARGV[0] eq "win2k"){ print "\t$uppercase=$objs", "_ \@", $count, 
"\n"; $count ++; } - + + #for openblas_set_num_threads + print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n"; + $count ++; + # if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print "\t",$objs,"=$objs"," \@", $count, "\n"; diff --git a/f_check b/f_check index 93c39ec88..8e3855b10 100644 --- a/f_check +++ b/f_check @@ -32,11 +32,12 @@ if ($compiler eq "") { "pgf95", "pgf90", "pgf77", "ifort"); +OUTER: foreach $lists (@lists) { foreach $path (@path) { - if (-f $path . "/" . $lists) { + if (-x $path . "/" . $lists) { $compiler = $lists; - break; + last OUTER; } } } diff --git a/getarch.c b/getarch.c index 5b614472a..7e08e774e 100644 --- a/getarch.c +++ b/getarch.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ +/* #define FORCE_BOBCAT */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ @@ -278,6 +279,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "NEHALEM" #endif +#ifdef FORCE_SANDYBRIDGE +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL @@ -349,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "BARCELONA" #endif +#if defined(FORCE_BOBCAT) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BOBCAT" +#define ARCHCONFIG "-DBOBCAT " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" +#define LIBNAME "bobcat" +#define CORENAME "BOBCAT" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c new file mode 100644 index 000000000..839bd5939 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -0,0 +1,235 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + **********************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2,*dest4; + ii = col&-8; + ii = ii*(2*row); + dest4 = dest+ii; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j