diff --git a/CMakeLists.txt b/CMakeLists.txt index e77aec030..53c1709a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 11) +set(OpenBLAS_PATCH_VERSION 12) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions diff --git a/Changelog.txt b/Changelog.txt index bd0e60992..1e843e38e 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,9 +1,36 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.12 + 24-Oct-2020 + +common: + * Fixed missing LAPACK functions (inadvertently dropped during + the build system restructuring) + * Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458) + +POWER: + * Added optimized SCOPY/CCOPY kernels for POWER10 + * Increased and unified the default size of the GEMM BUFFER + * Fixed building for POWER10 in DYNAMIC_ARCH mode + * POWER10 compatibility test now checks binutils version as well + * Cleaned up compiler warnings + +x86_64: + * corrected compiler version checks for AVX2 compatibility + * added compiler option -mavx2 for building with flang + * fixed direct SGEMM pathway for small matrix sizes (broken by + the code refactoring in 0.3.11) + * fixed unhandled partial register clobbers in several kernels + for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer + +ARMV8: + * improved Apple Vortex support to include cross-compiling + ==================================================================== Version 0.3.11 17-Oct-2020 - common: +common: * API change: the newly added BFLOAT16 functions were renamed to use the letter "B" instead of "H" to avoid potential confusion with @@ -28,7 +55,7 @@ Version 0.3.11 * Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as enabling these options * Fixed detection 
of gfortran when invoked through an mpi wrapper - * Improve thread reinitialization performance with OpenMP xafter a fork + * Improve thread reinitialization performance with OpenMP after a fork * Added support for building only the subset of the library required for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE * Optional function name prefixes and suffixes are now correctly @@ -66,7 +93,6 @@ ARMV8: * Fixed cpu detection on BSD-like systems * Fixed compilation in -std=C18 mode - IBM Z: * Added support for compiling with the clang compiler * Improved GEMM performance on Z14 diff --git a/Makefile.power b/Makefile.power index e766f8499..c7e972290 100644 --- a/Makefile.power +++ b/Makefile.power @@ -10,7 +10,7 @@ USE_OPENMP = 1 endif ifeq ($(CORE), POWER10) -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif diff --git a/Makefile.rule b/Makefile.rule index acfe568d6..a4d11dc7c 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.11 +VERSION = 0.3.12 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -295,10 +295,13 @@ COMMON_PROF = -pg -# the below is not yet configurable, use cmake if you need to build only select types -BUILD_SINGLE = 1 -BUILD_DOUBLE = 1 -BUILD_COMPLEX = 1 -BUILD_COMPLEX16 = 1 +# By default the library contains BLAS functions (and LAPACK if selected) for all input types. +# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) 
or only +# the functions for complex numbers, uncomment the desired type(s) below +# BUILD_SINGLE = 1 +# BUILD_DOUBLE = 1 +# BUILD_COMPLEX = 1 +# BUILD_COMPLEX16 = 1 +# # End of user configuration # diff --git a/Makefile.system b/Makefile.system index 461f7370b..30d8f4ccf 100644 --- a/Makefile.system +++ b/Makefile.system @@ -641,6 +641,7 @@ DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 DYNAMIC_CORE += POWER10 +CCOMMON_OPT += -DHAVE_P10_SUPPORT endif ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGT5), 1) @@ -648,11 +649,15 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -ifeq ($(GCCVERSIONGTEQ11), 1) +# The '>' below must be backslash-escaped, otherwise the shell treats it as an output redirection to a file named "=" and the result is empty +LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) +ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 +CCOMMON_OPT += -DHAVE_P10_SUPPORT else ifeq ($(GCCVERSIONGTEQ10), 1) -ifeq ($(GCCMINORVERSIONGTEQ2), 1) +ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 +CCOMMON_OPT += -DHAVE_P10_SUPPORT endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) 
diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 3a42e19e4..a849f0b01 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -74,8 +74,10 @@ ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) +GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) CCOMMON_OPT += -mavx2 endif else @@ -86,8 +88,14 @@ endif ifeq ($(F_COMPILER), GFORTRAN) # AVX2 support was added in 4.7.0 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) +GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5) GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7) -ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) +GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) +FCOMMON_OPT += -mavx2 +endif +else +ifeq ($(F_COMPILER), FLANG) FCOMMON_OPT += -mavx2 endif endif diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 99e685d04..5457bfb07 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -49,6 +49,7 @@ if (DYNAMIC_ARCH) if (POWER) set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) + set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT") endif () if (X86) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index f40304c09..3e38abbf5 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -416,6 +416,29 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) +elseif ("${TCORE}" STREQUAL "VORTEX") + file(APPEND ${TARGET_CONF_TEMP} + "#define ARMV8\n" + "#define L1_CODE_SIZE\t32768\n" + "#define L1_CODE_LINESIZE\t64\n" + 
"#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t5262144\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "POWER6") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" diff --git a/common_power.h b/common_power.h index e0685f760..a61e4e28a 100644 --- a/common_power.h +++ b/common_power.h @@ -844,8 +844,8 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) || defined(POWER9) || defined(POWER10) -#define BUFFER_SIZE ( 64 << 20) +#elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10) +#define BUFFER_SIZE ( 64 << 22) #else #define BUFFER_SIZE ( 16 << 20) #endif diff --git a/cpuid_arm64.c b/cpuid_arm64.c index ae150ef1b..5f5d7771b 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -424,7 +424,7 @@ void get_cpuconfig(void) sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); printf("#define L1_DATA_SIZE %d \n",value); sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); - printf("#define L2_DATA_SIZE %d \n",value); + printf("#define L2_SIZE %d \n",value); break; #endif } diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index ca1d42408..85fc5b3ba 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,10 +6,10 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ - || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) -#define HAVE_P10_SUPPORT 1 -#endif +//#if (!defined 
__GNUC__) || ( __GNUC__ >= 11) \ +// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) +//#define HAVE_P10_SUPPORT 1 +//#endif #ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif diff --git a/exports/Makefile b/exports/Makefile index 3f1ffba11..eec0593aa 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -120,10 +120,10 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) $(LIBPREFIX).def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) ifeq ($(OSNAME), Darwin) INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib @@ -258,16 +258,16 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" 
$(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl 
./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. diff --git a/exports/gensymbol b/exports/gensymbol index 8482ecb7e..22e470da5 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -50,8 +50,8 @@ zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, zgeadd, dzsum); -@cblasobjs = (lsame, xerbla); -@halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@blasobjs = (lsame, xerbla); +@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -72,7 +72,7 @@ ); @cblasobjss = ( - cblas_sasum, cblas_saxpy, + cblas_sasum, cblas_saxpy, cblas_saxpby, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, @@ -415,7 +415,7 @@ zpotri, cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, ctpqrt, ctpqrt2, ctpmqrt, ctprfb, ); -@lapack2objszc = ( +@lapackobjs2zc = ( # ZCLASRC -- Double-single mixed precision complex routines called from # single, single-extra and double precision complex LAPACK # routines (i.e. from CLASRC, CXLASRC, ZLASRC). 
@@ -425,7 +425,7 @@ zpotri, cpotrs, ); -@lapack2objsd = ( +@lapackobjs2d = ( # DLASRC -- Double precision real LAPACK routines # already provided by @lapackobjs: # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, @@ -568,7 +568,7 @@ zpotri, ); # functions added for lapack-3.6.0 -@lapack2objsc = ( @lapack2objsc, +@lapackobjs2c = ( @lapackobjs2c, cgejsv, cgesvdx, cgesvj, @@ -604,7 +604,7 @@ zpotri, csyr2, cunm22, ); -@lapackobjs2d = (@lapack2objsd, +@lapackobjs2d = (@lapackobjs2d, dbdsvdx, dgesvdx, dgetrf2, @@ -637,7 +637,7 @@ zpotri, dpotrf2, dsecnd, ); - @lapack2objss = (@lapack2objss, + @lapackobjs2s = (@lapackobjs2s, sbdsvdx, second, sgesvdx, @@ -670,7 +670,7 @@ zpotri, sorm22, spotrf2, ); - @lapack2objsz = (@lapack2objsz, + @lapackobjs2z = (@lapackobjs2z, zgejsv, zgesvdx, zgesvj, @@ -707,7 +707,7 @@ zpotri, zunm22, ); # functions added for lapack-3.7.0 -@lapack2objss = (@lapack2objss, +@lapackobjs2s = (@lapackobjs2s, slarfy, strevc3, sgelqt, @@ -726,7 +726,7 @@ zpotri, stplqt2, stpmlqt, ); - @lapack2objsd = (@lapack2objsd, + @lapackobjs2d = (@lapackobjs2d, dlarfy, dsyconvf, dtrevc3, @@ -746,7 +746,7 @@ zpotri, dtplqt2, dtpmlqt, ); - @lapack2objsc = (@lapack2objsc, + @lapackobjs2c = (@lapackobjs2c, clarfy, csyconvf, ctrevc3, @@ -766,7 +766,7 @@ zpotri, ctplqt2, ctpmlqt, ); - @lapack2objsz = (@lapack2objsz, + @lapackobjs2z = (@lapackobjs2z, zlarfy, zsyconvf, ztrevc3, @@ -786,31 +786,31 @@ zpotri, zlamswlq, zgemlq, ); - @lapack2objs = (@lapack2objs, - sladiv1, - dladiv1, + @lapackobjs2s = (@lapackobjs2s, + sladiv1); + @lapackobjs2d = (@lapackobjs2d, + dladiv1); + @lapackobjs = (@lapackobjs, iparam2stage, - # functions added for lapack-3.8.0 - ilaenv2stage, ); # functions added for lapack-3.9.0 -@lapack2objsc = (@lapack2objsc, +@lapackobjs2c = (@lapackobjs2c, cgesvdq, - cungtsqr, - dcombssq, + cungtsqr ); -@lapack2objsd = (@lapack2objsd, +@lapackobjs2d = (@lapackobjs2d, + dcombssq, dgesvdq, dorgtsqr, ); -@lapack2objss = (@lapack2objss, 
+@lapackobjs2s = (@lapackobjs2s, scombssq, sgesvdq, sorgtsqr, ); -@lapack2objsz = (@lapack2objsz, +@lapackobjs2z = (@lapackobjs2z, zgesvdq, zungtsqr ); @@ -835,10 +835,29 @@ zpotri, dlatzm, dtzrqf); @lapack_deprecated_objss = ( + sgelsx, sgegs, - sgegv, + sgegv, + sgeqpf, + sggsvd, + sggsvp, + slahrd, + slatzm, + stzrqf ); - + +@lapack_deprecated_objsz = ( + zgegs, + zgegv, + zgelsx, + zgeqpf, + zggsvd, + zggsvp, + zlahrd, + zlatzm, + ztzrqf + ); + @lapacke_deprecated_objsc = ( LAPACKE_cggsvp, LAPACKE_cggsvp_work, @@ -3590,14 +3609,18 @@ use File::Basename; my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); if ($ARGV[12] == 1) { - @blasobjs = (@blasobjs, @halfblasobjs); - @cblasobjs = (@cblasobjs, @halfcblasobjs); + @blasobjs = (@blasobjs, @bfblasobjs); + @cblasobjs = (@cblasobjs, @bfcblasobjs); } if ($ARGV[13] == 1) { @blasobjs = (@blasobjs, @blasobjss); @cblasobjs = (@cblasobjs, @cblasobjss); @lapackobjs = (@lapackobjs, @lapackobjss); - @lapack2objs = (@lapack2objs, @lapack2objss); + @lapackobjs2 = (@lapackobjs2, @lapackobjs2s); + @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); + @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); + @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objss); + @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objss); @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); @lapackeobjs = (@lapackeobjs, @lapackeobjss); } @@ -3605,7 +3628,12 @@ if ($ARGV[14] == 1) { @blasobjs = (@blasobjs, @blasobjsd); @cblasobjs = (@cblasobjs, @cblasobjsd); @lapackobjs = (@lapackobjs, @lapackobjsd); - @lapack2objs = (@lapack2objs, @lapack2objsd); + if ($ARGV[13] == 0) { + @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); + } + @lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz); + @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsd); + @lapacke_deprecated_objs = 
(@lapacke_deprecated_objs, @lapacke_deprecated_objsd); @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); @lapackeobjs = (@lapackeobjs, @lapackeobjsd); } @@ -3613,9 +3641,14 @@ if ($ARGV[15] == 1) { @blasobjs = (@blasobjs, @blasobjsc); @cblasobjs = (@cblasobjs, @cblasobjsc); @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); - @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsc); + @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc); @lapackobjs = (@lapackobjs, @lapackobjsc); - @lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc); + @lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc); + if ($ARGV[13] == 0) { + @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); + } + @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc); + @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsc); @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); @lapackeobjs = (@lapackeobjs, @lapackeobjsc); } @@ -3623,9 +3656,17 @@ if ($ARGV[16] == 1) { @blasobjs = (@blasobjs, @blasobjsz); @cblasobjs = (@cblasobjs, @cblasobjsz); @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); - @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsz); + @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz); @lapackobjs = (@lapackobjs, @lapackobjsz); - @lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc); + @lapackobjs2 = (@lapackobjs2, @lapackobjs2z); + if ($ARGV[15] == 0) { + @lapackobjs2 = (@lapackobjs2, @lapackobjs2zc); + } + if ($ARGV[14] == 0) { + @lapackobjs2 = (@lapackobjs2, @lapackobjs2dz); + } + @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz); + @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsz); @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_z); @lapackeobjs = (@lapackeobjs, @lapackeobjsz); } diff --git a/getarch.c 
b/getarch.c index e2c22d3a0..3f1448305 100644 --- a/getarch.c +++ b/getarch.c @@ -1222,6 +1222,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_VORTEX +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "VORTEX" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DVORTEX " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "vortex" +#define CORENAME "VORTEX" +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" diff --git a/kernel/Makefile b/kernel/Makefile index 43318d475..e52781c6d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -22,20 +22,25 @@ ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as endif endif + AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 - GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) - GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) - ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) +GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif endif ifeq ($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) - ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) + 
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) + ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 031d96581..86df7e3a2 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -150,9 +150,9 @@ CAXPYKERNEL = caxpy.c endif ZAXPYKERNEL = zaxpy_power10.c # -SCOPYKERNEL = scopy.c +SCOPYKERNEL = scopy_power10.c DCOPYKERNEL = dcopy_power10.c -CCOPYKERNEL = ccopy.c +CCOPYKERNEL = ccopy_power10.c ZCOPYKERNEL = zcopy_power10.c # SDOTKERNEL = sdot.c diff --git a/kernel/power/ccopy_power10.c b/kernel/power/ccopy_power10.c new file mode 100644 index 000000000..a5877cd12 --- /dev/null +++ b/kernel/power/ccopy_power10.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "copy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL + +static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + copy_kernel(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/dcopy_microk_power10.c b/kernel/power/copy_microk_power10.c similarity index 91% rename from kernel/power/dcopy_microk_power10.c rename to kernel/power/copy_microk_power10.c index 8940e0db9..c90dc3785 100644 --- a/kernel/power/dcopy_microk_power10.c +++ b/kernel/power/copy_microk_power10.c @@ -25,9 +25,9 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#define HAVE_KERNEL_64 1 +#define HAVE_KERNEL 1 -static void dcopy_kernel_64 (long n, double *x, double *y) +static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) { __asm__ ( @@ -49,8 +49,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y) "lxvp 60, 448(%2) \n\t" "lxvp 62, 480(%2) \n\t" "addi %2, %2, 512 \n\t" - +#if !defined(COMPLEX) && !defined(DOUBLE) + "addic. %1, %1, -128 \n\t" +#elif defined(COMPLEX) && defined(DOUBLE) + "addic. %1, %1, -32 \n\t" +#else "addic. %1, %1, -64 \n\t" +#endif "ble two%= \n\t" ".align 5 \n" @@ -94,7 +99,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y) "addi %3, %3, 512 \n\t" "addi %2, %2, 512 \n\t" +#if !defined(COMPLEX) && !defined(DOUBLE) + "addic. %1, %1, -128 \n\t" +#elif defined(COMPLEX) && defined(DOUBLE) + "addic. %1, %1, -32 \n\t" +#else "addic. %1, %1, -64 \n\t" +#endif "bgt one%= \n" "two%=: \n\t" @@ -121,7 +132,7 @@ static void dcopy_kernel_64 (long n, double *x, double *y) "=m" (*y), "+r" (n), // 1 "+b" (x), // 2 - "+b" (y) // 3 + "+b" (y) // 3 : "m" (*x) : diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index 822420dfd..35faad19e 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 400 #define STACKSIZE 592 #define ALPHA_R_SP 304+192(SP) #define ALPHA_I_SP 312+192(SP) #else -#define STACKSIZE 256 #define STACKSIZE 452 #define ALPHA_R_SP 224+196(SP) #define ALPHA_I_SP 232+196(SP) diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c index 32530d570..cd10b7136 100644 --- a/kernel/power/dcopy_power10.c +++ b/kernel/power/dcopy_power10.c @@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(__VEC__) || defined(__ALTIVEC__) -#include "dcopy_microk_power10.c" +#include "copy_microk_power10.c" #endif -#ifndef HAVE_KERNEL_64 +#ifndef HAVE_KERNEL -static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; @@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG n1 = n & -64; if ( n1 > 0 ) { - dcopy_kernel_64(n1, x, y); + copy_kernel(n1, x, y); i=n1; } diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 651fd53fc..f8ed12ee9 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 320 #define STACKSIZE 512 #define ALPHA_SP 296+192(SP) #define FZERO 304+192(SP) #else -#define STACKSIZE 240 #define STACKSIZE 440 #define ALPHA_SP 224+200(SP) #define FZERO 232+200(SP) diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 84c65f503..91154ad37 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef __64BIT__ -#define STACKSIZE 320 #define STACKSIZE 520 #define ALPHA_SP 296+200(SP) #define FZERO 304+200(SP) diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S index 8a423f181..5b349db12 100644 --- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S @@ -47,7 +47,6 @@ #endif #ifdef __64BIT__ -#define STACKSIZE 320 #define STACKSIZE 520 #define ALPHA 296+200(SP) #define FZERO 304+200(SP) diff --git a/kernel/power/scopy_power10.c b/kernel/power/scopy_power10.c new file mode 100644 index 000000000..298a8998a --- /dev/null +++ b/kernel/power/scopy_power10.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "copy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL + +static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + copy_kernel (n1, x, y); + i=n1; + } + + while(i < n) + { + y[i] = x[i] ; + i++ ; + + } + + + } + else + { + + while(i < n) + { + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index 78e539231..1f9912c49 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 340 #define STACKSIZE 540 #define ALPHA_SP 296+200(SP) #define FZERO 304+200(SP) diff --git a/kernel/power/zcopy_microk_power10.c b/kernel/power/zcopy_microk_power10.c deleted file mode 100644 index f2f2119a3..000000000 --- a/kernel/power/zcopy_microk_power10.c +++ /dev/null @@ -1,134 +0,0 @@ -/*************************************************************************** -Copyright (c) 2020, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define HAVE_KERNEL_32 1 - -static void zcopy_kernel_32 (long n, double *x, double *y) -{ - __asm__ - ( - "lxvp 32, 0(%2) \n\t" - "lxvp 34, 32(%2) \n\t" - "lxvp 36, 64(%2) \n\t" - "lxvp 38, 96(%2) \n\t" - "lxvp 40, 128(%2) \n\t" - "lxvp 42, 160(%2) \n\t" - "lxvp 44, 192(%2) \n\t" - "lxvp 46, 224(%2) \n\t" - - "lxvp 48, 256(%2) \n\t" - "lxvp 50, 288(%2) \n\t" - "lxvp 52, 320(%2) \n\t" - "lxvp 54, 352(%2) \n\t" - "lxvp 56, 384(%2) \n\t" - "lxvp 58, 416(%2) \n\t" - "lxvp 60, 448(%2) \n\t" - "lxvp 62, 480(%2) \n\t" - "addi %2, %2, 512 \n\t" - - "addic. %1, %1, -32 \n\t" - "ble two%= \n\t" - - ".align 5 \n" - "one%=: \n\t" - - "stxvp 32, 0(%3) \n\t" - "lxvp 32, 0(%2) \n\t" - "stxvp 34, 32(%3) \n\t" - "lxvp 34, 32(%2) \n\t" - "stxvp 36, 64(%3) \n\t" - "lxvp 36, 64(%2) \n\t" - "stxvp 38, 96(%3) \n\t" - "lxvp 38, 96(%2) \n\t" - - "stxvp 40, 128(%3) \n\t" - "lxvp 40, 128(%2) \n\t" - "stxvp 42, 160(%3) \n\t" - "lxvp 42, 160(%2) \n\t" - "stxvp 44, 192(%3) \n\t" - "lxvp 44, 192(%2) \n\t" - "stxvp 46, 224(%3) \n\t" - "lxvp 46, 224(%2) \n\t" - - "stxvp 48, 256(%3) \n\t" - "lxvp 48, 256(%2) \n\t" - "stxvp 50, 288(%3) \n\t" - "lxvp 50, 288(%2) \n\t" - "stxvp 52, 320(%3) \n\t" - "lxvp 52, 320(%2) \n\t" - "stxvp 54, 352(%3) \n\t" - "lxvp 54, 352(%2) \n\t" - "stxvp 56, 384(%3) \n\t" - "lxvp 56, 384(%2) \n\t" - "stxvp 58, 416(%3) \n\t" - "lxvp 58, 416(%2) \n\t" - "stxvp 60, 448(%3) \n\t" - "lxvp 60, 448(%2) \n\t" - "stxvp 62, 480(%3) \n\t" - "lxvp 62, 480(%2) \n\t" - - "addi %3, %3, 512 \n\t" - "addi %2, %2, 512 \n\t" - - "addic. 
%1, %1, -32 \n\t" - "bgt one%= \n" - - "two%=: \n\t" - - "stxvp 32, 0(%3) \n\t" - "stxvp 34, 32(%3) \n\t" - "stxvp 36, 64(%3) \n\t" - "stxvp 38, 96(%3) \n\t" - "stxvp 40, 128(%3) \n\t" - "stxvp 42, 160(%3) \n\t" - "stxvp 44, 192(%3) \n\t" - "stxvp 46, 224(%3) \n\t" - "stxvp 48, 256(%3) \n\t" - "stxvp 50, 288(%3) \n\t" - "stxvp 52, 320(%3) \n\t" - "stxvp 54, 352(%3) \n\t" - "stxvp 56, 384(%3) \n\t" - "stxvp 58, 416(%3) \n\t" - "stxvp 60, 448(%3) \n\t" - "stxvp 62, 480(%3) \n\t" - - "#n=%1 x=%4=%2 y=%0=%3" - : - "=m" (*y), - "+r" (n), // 1 - "+b" (x), // 2 - "+b" (y) // 3 - : - "m" (*x) - : - "cr0", - "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", - "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", - "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" - ); -} diff --git a/kernel/power/zcopy_power10.c b/kernel/power/zcopy_power10.c index 99d463b02..6b4e7a7d4 100644 --- a/kernel/power/zcopy_power10.c +++ b/kernel/power/zcopy_power10.c @@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(__VEC__) || defined(__ALTIVEC__) -#include "zcopy_microk_power10.c" +#include "copy_microk_power10.c" #endif -#ifndef HAVE_KERNEL_32 +#ifndef HAVE_KERNEL -static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; @@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG n1 = n & -32; if ( n1 > 0 ) { - zcopy_kernel_32(n1, x, y); + copy_kernel(n1, x, y); i=n1; ix=n1*2; iy=n1*2; diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 4ed27d96b..956d75ffc 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -513,7 +513,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT al #endif -static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { +static __attribute__((always_inline)) inline void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; for (i = 0; i < n; i++) { *dest = *src; diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index ca2209340..a32558dc9 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", @@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c 
b/kernel/x86_64/caxpy_microk_haswell-2.c index b605ea34c..129ce7a49 100644 --- a/kernel/x86_64/caxpy_microk_haswell-2.c +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -120,7 +120,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c index 72d37afed..564dfbd0f 100644 --- a/kernel/x86_64/caxpy_microk_sandy-2.c +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -104,7 +104,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c index 7ca7af070..cc5c5de76 100644 --- a/kernel/x86_64/caxpy_microk_steamroller-2.c +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", @@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c index f3682e6d7..ecc0ecbd3 100644 --- 
a/kernel/x86_64/daxpy_microk_haswell-2.c +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -67,8 +67,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (y), // 3 "r" (alpha) // 4 : "cc", - "%xmm0", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index dbb5487f7..faac72870 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -84,8 +84,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c index cc4bcd90a..0320a2e36 100644 --- a/kernel/x86_64/ddot_microk_piledriver-2.c +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -91,6 +91,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -155,6 +156,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c index 84493ec27..35ba86a7d 100644 --- a/kernel/x86_64/ddot_microk_sandy-2.c +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -89,8 +89,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT 
*x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c index 27d5244ce..94c012f0d 100644 --- a/kernel/x86_64/ddot_microk_steamroller-2.c +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -88,6 +88,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/dgemm_tcopy_16_skylakex.c b/kernel/x86_64/dgemm_tcopy_16_skylakex.c index a1da60f8f..ff2c48617 100644 --- a/kernel/x86_64/dgemm_tcopy_16_skylakex.c +++ b/kernel/x86_64/dgemm_tcopy_16_skylakex.c @@ -126,4 +126,5 @@ int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_di } src1 += src_inc; } + return 0; } diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index da0fa2fff..c20c0a030 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -105,9 +105,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -182,11 +181,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[1]), // 5 "r" (alpha) // 6 : "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", - "%xmm6", - "%xmm8", - "%xmm12", "%xmm13", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c index 466931b82..57fa426ba 100644 --- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c +++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c @@ -140,7 +140,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -235,9 +235,11 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c index 958fd3e0a..b398307d3 100644 --- a/kernel/x86_64/dgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c @@ -117,7 +117,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[2]), // 6 "r" (ap[3]) // 7 : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 7099ba4c6..8cc697f05 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -67,7 +67,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (y), // 3 "r" (alpha) // 4 : "cc", - "%xmm0", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" diff --git 
a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c index 5feea7f24..ebbcc0045 100644 --- a/kernel/x86_64/saxpy_microk_piledriver-2.c +++ b/kernel/x86_64/saxpy_microk_piledriver-2.c @@ -86,7 +86,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (y), // 3 "r" (alpha) // 4 : "cc", - "%xmm0", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -147,7 +148,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (y), // 3 "r" (alpha) // 4 : "cc", - "%xmm0", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index 91dc928d3..322f4b28c 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -87,8 +87,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c index ae25d5a50..ce09b06cf 100644 --- a/kernel/x86_64/sdot_microk_sandy-2.c +++ b/kernel/x86_64/sdot_microk_sandy-2.c @@ -90,8 +90,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sgemm_direct_skylakex.c 
b/kernel/x86_64/sgemm_direct_skylakex.c index a7cddbb3d..aaadcf151 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,7 +1,8 @@ -#if defined(SKYLAKEX) || defined (COOPERLAKE) /* the direct sgemm code written by Arjan van der Ven */ #include #include "common.h" + +#if defined(SKYLAKEX) || defined (COOPERLAKE) /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 93e1e26e8..556dcfde5 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -164,11 +164,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -286,9 +284,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c index 8c370b4c0..fcabc0def 100644 --- a/kernel/x86_64/sgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -138,7 +138,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[2]), // 6 "r" (ap[3]) // 7 : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", 
"%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 15d367971..ccb26134f 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", @@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c index 89d23daf3..8f299ea2d 100644 --- a/kernel/x86_64/zaxpy_microk_haswell-2.c +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -120,7 +120,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c index 17b8b24f7..5246c72e8 100644 --- a/kernel/x86_64/zaxpy_microk_sandy-2.c +++ b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -108,9 +108,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; @@ -185,9 
+186,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c index 907b1ae00..88e3a680b 100644 --- a/kernel/x86_64/zaxpy_microk_steamroller-2.c +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", @@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (alpha), // 4 "r" (mvec) // 5 : "cc", - "%xmm0", "%xmm1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c index f58a5c4e9..4928b1bc0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -71,7 +71,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, goto exit_level_0; } liwork = iwork_query; - lcwork = LAPACK_C2INT(cwork_query); + lcwork = LAPACK_Z2INT(cwork_query); lrwork = (lapack_int)rwork_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );