Merge pull request #2943 from xianyi/develop
Merge from develop for 0.3.12 release
This commit is contained in:
commit
6e3a05f2c9
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
|||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 11)
|
||||
set(OpenBLAS_PATCH_VERSION 12)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
|
|
@ -1,9 +1,36 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.12
|
||||
24-Oct-2020
|
||||
|
||||
common:
|
||||
* Fixed missing LAPACK functions (inadvertently dropped during
|
||||
the build system restructuring)
|
||||
* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
|
||||
|
||||
POWER:
|
||||
* Added optimized SCOPY/CCOPY kernels for POWER10
|
||||
* Increased and unified the default size of the GEMM BUFFER
|
||||
* Fixed building for POWER10 in DYNAMIC_ARCH mode
|
||||
* POWER10 compatibility test now checks binutils version as well
|
||||
* Cleaned up compiler warnings
|
||||
|
||||
x86_64:
|
||||
* corrected compiler version checks for AVX2 compatibility
|
||||
* added compiler option -mavx2 for building with flang
|
||||
* fixed direct SGEMM pathway for small matrix sizes (broken by
|
||||
the code refactoring in 0.3.11)
|
||||
* fixed unhandled partial register clobbers in several kernels
|
||||
for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
|
||||
|
||||
ARMV8:
|
||||
* improved Apple Vortex support to include cross-compiling
|
||||
|
||||
====================================================================
|
||||
Version 0.3.11
|
||||
17-Oct-2020
|
||||
|
||||
common:
|
||||
common:
|
||||
* API change:
|
||||
the newly added BFLOAT16 functions were renamed to use the
|
||||
letter "B" instead of "H" to avoid potential confusion with
|
||||
|
@ -28,7 +55,7 @@ Version 0.3.11
|
|||
* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
|
||||
enabling these options
|
||||
* Fixed detection of gfortran when invoked through an mpi wrapper
|
||||
* Improve thread reinitialization performance with OpenMP xafter a fork
|
||||
* Improve thread reinitialization performance with OpenMP after a fork
|
||||
* Added support for building only the subset of the library required
|
||||
for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
|
||||
* Optional function name prefixes and suffixes are now correctly
|
||||
|
@ -66,7 +93,6 @@ ARMV8:
|
|||
* Fixed cpu detection on BSD-like systems
|
||||
* Fixed compilation in -std=C18 mode
|
||||
|
||||
|
||||
IBM Z:
|
||||
* Added support for compiling with the clang compiler
|
||||
* Improved GEMM performance on Z14
|
||||
|
|
|
@ -10,7 +10,7 @@ USE_OPENMP = 1
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), POWER10)
|
||||
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.11
|
||||
VERSION = 0.3.12
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -295,10 +295,13 @@ COMMON_PROF = -pg
|
|||
|
||||
|
||||
|
||||
# the below is not yet configurable, use cmake if you need to build only select types
|
||||
BUILD_SINGLE = 1
|
||||
BUILD_DOUBLE = 1
|
||||
BUILD_COMPLEX = 1
|
||||
BUILD_COMPLEX16 = 1
|
||||
# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
|
||||
# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
|
||||
# the functions for complex numbers, uncomment the desired type(s) below
|
||||
# BUILD_SINGLE = 1
|
||||
# BUILD_DOUBLE = 1
|
||||
# BUILD_COMPLEX = 1
|
||||
# BUILD_COMPLEX16 = 1
|
||||
#
|
||||
# End of user configuration
|
||||
#
|
||||
|
|
|
@ -641,6 +641,7 @@ DYNAMIC_CORE += POWER8
|
|||
ifneq ($(C_COMPILER), GCC)
|
||||
DYNAMIC_CORE += POWER9
|
||||
DYNAMIC_CORE += POWER10
|
||||
CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifeq ($(GCCVERSIONGT5), 1)
|
||||
|
@ -648,11 +649,14 @@ DYNAMIC_CORE += POWER9
|
|||
else
|
||||
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
# Evaluates to 1 when the minor version reported by `ld --version` is >= 35.
# The '>' must be backslash-escaped: unescaped, the shell treats it as an output
# redirection (to a file named "="), expr fails and this variable ends up empty.
LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
|
||||
ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
|
||||
DYNAMIC_CORE += POWER10
|
||||
CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
||||
else ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
|
||||
ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
|
||||
DYNAMIC_CORE += POWER10
|
||||
CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
||||
endif
|
||||
else
|
||||
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||
|
|
|
@ -74,8 +74,10 @@ ifndef NO_AVX2
|
|||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
|
||||
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
|
||||
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
|
||||
CCOMMON_OPT += -mavx2
|
||||
endif
|
||||
else
|
||||
|
@ -86,8 +88,14 @@ endif
|
|||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
# AVX2 support was added in 4.7.0
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
|
||||
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
|
||||
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
else
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
|
|
|
@ -49,6 +49,7 @@ if (DYNAMIC_ARCH)
|
|||
|
||||
if (POWER)
|
||||
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
|
||||
endif ()
|
||||
|
||||
if (X86)
|
||||
|
|
|
@ -416,6 +416,29 @@ endif ()
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "VORTEX")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t5262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "POWER6")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
|
|
|
@ -844,8 +844,8 @@ Lmcount$lazy_ptr:
|
|||
#define BUFFER_SIZE ( 2 << 20)
|
||||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#define BUFFER_SIZE ( 64 << 22)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#endif
|
||||
|
|
|
@ -424,7 +424,7 @@ void get_cpuconfig(void)
|
|||
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
|
||||
printf("#define L1_DATA_SIZE %d \n",value);
|
||||
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
|
||||
printf("#define L2_DATA_SIZE %d \n",value);
|
||||
printf("#define L2_SIZE %d \n",value);
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -6,10 +6,10 @@ extern gotoblas_t gotoblas_POWER8;
|
|||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
|
||||
|| (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
#define HAVE_P10_SUPPORT 1
|
||||
#endif
|
||||
//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
|
||||
// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
//#define HAVE_P10_SUPPORT 1
|
||||
//#endif
|
||||
#ifdef HAVE_P10_SUPPORT
|
||||
extern gotoblas_t gotoblas_POWER10;
|
||||
#endif
|
||||
|
|
|
@ -120,10 +120,10 @@ dll : ../$(LIBDLLNAME)
|
|||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
|
||||
|
||||
$(LIBPREFIX).def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
|
@ -258,16 +258,16 @@ static : ../$(LIBNAME)
|
|||
rm -f goto.$(SUFFIX)
|
||||
|
||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
test : linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||
|
|
|
@ -50,8 +50,8 @@
|
|||
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
|
||||
zgeadd, dzsum);
|
||||
|
||||
@cblasobjs = (lsame, xerbla);
|
||||
@halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
|
||||
@blasobjs = (lsame, xerbla);
|
||||
@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
|
||||
@cblasobjsc = (
|
||||
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
|
||||
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
|
||||
|
@ -72,7 +72,7 @@
|
|||
);
|
||||
|
||||
@cblasobjss = (
|
||||
cblas_sasum, cblas_saxpy,
|
||||
cblas_sasum, cblas_saxpy, cblas_saxpby,
|
||||
cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm,
|
||||
cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg,
|
||||
cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr,
|
||||
|
@ -94,7 +94,7 @@
|
|||
|
||||
@cblasobjs = ( cblas_xerbla );
|
||||
|
||||
@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
|
||||
@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
|
||||
|
||||
@exblasobjs = (
|
||||
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
|
||||
|
@ -415,7 +415,7 @@ zpotri,
|
|||
cgeqrt, cgeqrt2, cgeqrt3, cgemqrt,
|
||||
ctpqrt, ctpqrt2, ctpmqrt, ctprfb,
|
||||
);
|
||||
@lapack2objszc = (
|
||||
@lapackobjs2zc = (
|
||||
# ZCLASRC -- Double-single mixed precision complex routines called from
|
||||
# single, single-extra and double precision complex LAPACK
|
||||
# routines (i.e. from CLASRC, CXLASRC, ZLASRC).
|
||||
|
@ -425,7 +425,7 @@ zpotri,
|
|||
cpotrs,
|
||||
);
|
||||
|
||||
@lapack2objsd = (
|
||||
@lapackobjs2d = (
|
||||
# DLASRC -- Double precision real LAPACK routines
|
||||
# already provided by @lapackobjs:
|
||||
# dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri,
|
||||
|
@ -568,7 +568,7 @@ zpotri,
|
|||
);
|
||||
# functions added for lapack-3.6.0
|
||||
|
||||
@lapack2objsc = ( @lapack2objsc,
|
||||
@lapackobjs2c = ( @lapackobjs2c,
|
||||
cgejsv,
|
||||
cgesvdx,
|
||||
cgesvj,
|
||||
|
@ -604,7 +604,7 @@ zpotri,
|
|||
csyr2,
|
||||
cunm22,
|
||||
);
|
||||
@lapackobjs2d = (@lapack2objsd,
|
||||
@lapackobjs2d = (@lapackobjs2d,
|
||||
dbdsvdx,
|
||||
dgesvdx,
|
||||
dgetrf2,
|
||||
|
@ -637,7 +637,7 @@ zpotri,
|
|||
dpotrf2,
|
||||
dsecnd,
|
||||
);
|
||||
@lapack2objss = (@lapack2objss,
|
||||
@lapackobjs2s = (@lapackobjs2s,
|
||||
sbdsvdx,
|
||||
second,
|
||||
sgesvdx,
|
||||
|
@ -670,7 +670,7 @@ zpotri,
|
|||
sorm22,
|
||||
spotrf2,
|
||||
);
|
||||
@lapack2objsz = (@lapack2objsz,
|
||||
@lapackobjs2z = (@lapackobjs2z,
|
||||
zgejsv,
|
||||
zgesvdx,
|
||||
zgesvj,
|
||||
|
@ -707,7 +707,7 @@ zpotri,
|
|||
zunm22,
|
||||
);
|
||||
# functions added for lapack-3.7.0
|
||||
@lapack2objss = (@lapack2objss,
|
||||
@lapackobjs2s = (@lapackobjs2s,
|
||||
slarfy,
|
||||
strevc3,
|
||||
sgelqt,
|
||||
|
@ -726,7 +726,7 @@ zpotri,
|
|||
stplqt2,
|
||||
stpmlqt,
|
||||
);
|
||||
@lapack2objsd = (@lapack2objsd,
|
||||
@lapackobjs2d = (@lapackobjs2d,
|
||||
dlarfy,
|
||||
dsyconvf,
|
||||
dtrevc3,
|
||||
|
@ -746,7 +746,7 @@ zpotri,
|
|||
dtplqt2,
|
||||
dtpmlqt,
|
||||
);
|
||||
@lapack2objsc = (@lapack2objsc,
|
||||
@lapackobjs2c = (@lapackobjs2c,
|
||||
clarfy,
|
||||
csyconvf,
|
||||
ctrevc3,
|
||||
|
@ -766,7 +766,7 @@ zpotri,
|
|||
ctplqt2,
|
||||
ctpmlqt,
|
||||
);
|
||||
@lapack2objsz = (@lapack2objsz,
|
||||
@lapackobjs2z = (@lapackobjs2z,
|
||||
zlarfy,
|
||||
zsyconvf,
|
||||
ztrevc3,
|
||||
|
@ -786,31 +786,31 @@ zpotri,
|
|||
zlamswlq,
|
||||
zgemlq,
|
||||
);
|
||||
@lapack2objs = (@lapack2objs,
|
||||
sladiv1,
|
||||
dladiv1,
|
||||
@lapackobjs2s = (@lapackobjs2s,
|
||||
sladiv1);
|
||||
@lapackobjs2d = (@lapackobjs2d,
|
||||
dladiv1);
|
||||
@lapackobjs = (@lapackobjs,
|
||||
iparam2stage,
|
||||
|
||||
# functions added for lapack-3.8.0
|
||||
|
||||
ilaenv2stage,
|
||||
);
|
||||
# functions added for lapack-3.9.0
|
||||
@lapack2objsc = (@lapack2objsc,
|
||||
@lapackobjs2c = (@lapackobjs2c,
|
||||
cgesvdq,
|
||||
cungtsqr,
|
||||
dcombssq,
|
||||
cungtsqr
|
||||
);
|
||||
@lapack2objsd = (@lapack2objsd,
|
||||
@lapackobjs2d = (@lapackobjs2d,
|
||||
dcombssq,
|
||||
dgesvdq,
|
||||
dorgtsqr,
|
||||
);
|
||||
@lapack2objss = (@lapack2objss,
|
||||
@lapackobjs2s = (@lapackobjs2s,
|
||||
scombssq,
|
||||
sgesvdq,
|
||||
sorgtsqr,
|
||||
);
|
||||
@lapack2objsz = (@lapack2objsz,
|
||||
@lapackobjs2z = (@lapackobjs2z,
|
||||
zgesvdq,
|
||||
zungtsqr
|
||||
);
|
||||
|
@ -835,10 +835,29 @@ zpotri,
|
|||
dlatzm, dtzrqf);
|
||||
|
||||
@lapack_deprecated_objss = (
|
||||
sgelsx,
|
||||
sgegs,
|
||||
sgegv,
|
||||
sgegv,
|
||||
sgeqpf,
|
||||
sggsvd,
|
||||
sggsvp,
|
||||
slahrd,
|
||||
slatzm,
|
||||
stzrqf
|
||||
);
|
||||
|
||||
|
||||
@lapack_deprecated_objsz = (
|
||||
zgegs,
|
||||
zgegv,
|
||||
zgelsx,
|
||||
zgeqpf,
|
||||
zggsvd,
|
||||
zggsvp,
|
||||
zlahrd,
|
||||
zlatzm,
|
||||
ztzrqf
|
||||
);
|
||||
|
||||
@lapacke_deprecated_objsc = (
|
||||
LAPACKE_cggsvp,
|
||||
LAPACKE_cggsvp_work,
|
||||
|
@ -3590,14 +3609,18 @@ use File::Basename;
|
|||
my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");
|
||||
|
||||
if ($ARGV[12] == 1) {
|
||||
@blasobjs = (@blasobjs, @halfblasobjs);
|
||||
@cblasobjs = (@cblasobjs, @halfcblasobjs);
|
||||
@blasobjs = (@blasobjs, @bfblasobjs);
|
||||
@cblasobjs = (@cblasobjs, @bfcblasobjs);
|
||||
}
|
||||
if ($ARGV[13] == 1) {
|
||||
@blasobjs = (@blasobjs, @blasobjss);
|
||||
@cblasobjs = (@cblasobjs, @cblasobjss);
|
||||
@lapackobjs = (@lapackobjs, @lapackobjss);
|
||||
@lapack2objs = (@lapack2objs, @lapack2objss);
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2s);
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
|
||||
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objss);
|
||||
@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objss);
|
||||
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s);
|
||||
@lapackeobjs = (@lapackeobjs, @lapackeobjss);
|
||||
}
|
||||
|
@ -3605,7 +3628,12 @@ if ($ARGV[14] == 1) {
|
|||
@blasobjs = (@blasobjs, @blasobjsd);
|
||||
@cblasobjs = (@cblasobjs, @cblasobjsd);
|
||||
@lapackobjs = (@lapackobjs, @lapackobjsd);
|
||||
@lapack2objs = (@lapack2objs, @lapack2objsd);
|
||||
if ($ARGV[13] == 0) {
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
|
||||
}
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz);
|
||||
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsd);
|
||||
@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsd);
|
||||
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d);
|
||||
@lapackeobjs = (@lapackeobjs, @lapackeobjsd);
|
||||
}
|
||||
|
@ -3613,9 +3641,14 @@ if ($ARGV[15] == 1) {
|
|||
@blasobjs = (@blasobjs, @blasobjsc);
|
||||
@cblasobjs = (@cblasobjs, @cblasobjsc);
|
||||
@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc);
|
||||
@cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsc);
|
||||
@cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc);
|
||||
@lapackobjs = (@lapackobjs, @lapackobjsc);
|
||||
@lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc);
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc);
|
||||
if ($ARGV[13] == 0) {
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
|
||||
}
|
||||
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc);
|
||||
@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsc);
|
||||
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c);
|
||||
@lapackeobjs = (@lapackeobjs, @lapackeobjsc);
|
||||
}
|
||||
|
@ -3623,9 +3656,17 @@ if ($ARGV[16] == 1) {
|
|||
@blasobjs = (@blasobjs, @blasobjsz);
|
||||
@cblasobjs = (@cblasobjs, @cblasobjsz);
|
||||
@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz);
|
||||
@cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsz);
|
||||
@cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz);
|
||||
@lapackobjs = (@lapackobjs, @lapackobjsz);
|
||||
@lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc);
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2z);
|
||||
if ($ARGV[15] == 0) {
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2zc);
|
||||
}
|
||||
if ($ARGV[14] == 0) {
|
||||
@lapackobjs2 = (@lapackobjs2, @lapackobjs2dz);
|
||||
}
|
||||
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz);
|
||||
@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsz);
|
||||
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_z);
|
||||
@lapackeobjs = (@lapackeobjs, @lapackeobjsz);
|
||||
}
|
||||
|
|
14
getarch.c
14
getarch.c
|
@ -1222,6 +1222,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_VORTEX
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "VORTEX"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DVORTEX " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "vortex"
|
||||
#define CORENAME "VORTEX"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
|
|
|
@ -22,20 +22,25 @@ ifeq ($(C_COMPILER), CLANG)
|
|||
override CFLAGS += -fno-integrated-as
|
||||
endif
|
||||
endif
|
||||
|
||||
AVX2OPT =
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
|
||||
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
|
||||
AVX2OPT = -mavx2
|
||||
endif
|
||||
endif
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
|
||||
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
|
||||
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
|
||||
# Three-digit flag string matched against 011/110/111 below. The clang branch
# computes GCCMINORVERSIONGTEQ2 (minor >= 2), so that is the digit used here;
# NOTE(review): GCCMINORVERSIONGTEQ7 appears to be set only in the GCC branches - confirm.
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2)
|
||||
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
|
||||
AVX2OPT = -mavx2
|
||||
endif
|
||||
endif
|
||||
|
|
|
@ -150,9 +150,9 @@ CAXPYKERNEL = caxpy.c
|
|||
endif
|
||||
ZAXPYKERNEL = zaxpy_power10.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
SCOPYKERNEL = scopy_power10.c
|
||||
DCOPYKERNEL = dcopy_power10.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
CCOPYKERNEL = ccopy_power10.c
|
||||
ZCOPYKERNEL = zcopy_power10.c
|
||||
#
|
||||
SDOTKERNEL = sdot.c
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "copy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL
|
||||
|
||||
/*
 * Portable fallback copy kernel, compiled only when the included micro-kernel
 * has not defined HAVE_KERNEL.
 *
 * Each pass loads eight consecutive FLOAT values from x, stores them to y,
 * and advances the element counter by 4.
 *
 *   n : element count to process (the loop runs while the counter is < n)
 *   x : source buffer
 *   y : destination buffer
 */
static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
{
    FLOAT *src = x;
    FLOAT *dst = y;
    FLOAT chunk[8];
    BLASLONG done;
    int k;

    for (done = 0; done < n; done += 4)
    {
        /* Load the whole group before storing any of it. */
        for (k = 0; k < 8; k++)
            chunk[k] = src[k];

        for (k = 0; k < 8; k++)
            dst[k] = chunk[k];

        src += 8;
        dst += 8;
    }

    return;
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * Copy n two-component elements from x to y.
 *
 *   n      : number of elements; nothing is done when n <= 0
 *   x      : source vector (two FLOAT values per element)
 *   inc_x  : stride of x, in elements
 *   y      : destination vector (two FLOAT values per element)
 *   inc_y  : stride of y, in elements
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;

    if ( n <= 0 ) return(0);

    if ( (inc_x == 1) && (inc_y == 1) )
    {
        /* Unit stride: hand the largest multiple of 64 elements to copy_kernel. */
        BLASLONG n1 = n & -64;
        if ( n1 > 0 )
        {
            copy_kernel(n1, x, y);
            i = n1;
            /* Two FLOAT values per element, so the FLOAT offsets are 2*n1. */
            ix = n1 * 2;
            iy = n1 * 2;
        }

        /* Remaining elements are copied one at a time. */
        while (i < n)
        {
            /* Read through ix (not iy), matching the strided loop below. */
            y[iy]   = x[ix];
            y[iy+1] = x[ix+1];
            ix += 2;
            iy += 2;
            i++;
        }
    }
    else
    {
        /* General strides: convert element strides to FLOAT strides. */
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;

        while (i < n)
        {
            y[iy]   = x[ix];
            y[iy+1] = x[ix+1];
            ix += inc_x2;
            iy += inc_y2;
            i++;
        }
    }

    return(0);
}
|
||||
|
||||
|
|
@ -25,9 +25,9 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_64 1
|
||||
#define HAVE_KERNEL 1
|
||||
|
||||
static void dcopy_kernel_64 (long n, double *x, double *y)
|
||||
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
|
@ -49,8 +49,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y)
|
|||
"lxvp 60, 448(%2) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
"addi %2, %2, 512 \n\t"
|
||||
|
||||
#if !defined(COMPLEX) && !defined(DOUBLE)
|
||||
"addic. %1, %1, -128 \n\t"
|
||||
#elif defined(COMPLEX) && defined(DOUBLE)
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
#else
|
||||
"addic. %1, %1, -64 \n\t"
|
||||
#endif
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
|
@ -94,7 +99,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y)
|
|||
"addi %3, %3, 512 \n\t"
|
||||
"addi %2, %2, 512 \n\t"
|
||||
|
||||
#if !defined(COMPLEX) && !defined(DOUBLE)
|
||||
"addic. %1, %1, -128 \n\t"
|
||||
#elif defined(COMPLEX) && defined(DOUBLE)
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
#else
|
||||
"addic. %1, %1, -64 \n\t"
|
||||
#endif
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
@ -121,7 +132,7 @@ static void dcopy_kernel_64 (long n, double *x, double *y)
|
|||
"=m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
"+b" (y) // 3
|
||||
:
|
||||
"m" (*x)
|
||||
:
|
|
@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef __64BIT__
|
||||
#define STACKSIZE 400
|
||||
#define STACKSIZE 592
|
||||
#define ALPHA_R_SP 304+192(SP)
|
||||
#define ALPHA_I_SP 312+192(SP)
|
||||
#else
|
||||
#define STACKSIZE 256
|
||||
#define STACKSIZE 452
|
||||
#define ALPHA_R_SP 224+196(SP)
|
||||
#define ALPHA_I_SP 232+196(SP)
|
||||
|
|
|
@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dcopy_microk_power10.c"
|
||||
#include "copy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_64
|
||||
#ifndef HAVE_KERNEL
|
||||
|
||||
static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG i=0;
|
||||
|
@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
BLASLONG n1 = n & -64;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dcopy_kernel_64(n1, x, y);
|
||||
copy_kernel(n1, x, y);
|
||||
i=n1;
|
||||
}
|
||||
|
||||
|
|
|
@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef __64BIT__
|
||||
#define STACKSIZE 320
|
||||
#define STACKSIZE 512
|
||||
#define ALPHA_SP 296+192(SP)
|
||||
#define FZERO 304+192(SP)
|
||||
#else
|
||||
#define STACKSIZE 240
|
||||
#define STACKSIZE 440
|
||||
#define ALPHA_SP 224+200(SP)
|
||||
#define FZERO 232+200(SP)
|
||||
|
|
|
@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef __64BIT__
|
||||
#define STACKSIZE 320
|
||||
#define STACKSIZE 520
|
||||
#define ALPHA_SP 296+200(SP)
|
||||
#define FZERO 304+200(SP)
|
||||
|
|
|
@ -47,7 +47,6 @@
|
|||
#endif
|
||||
|
||||
#ifdef __64BIT__
|
||||
#define STACKSIZE 320
|
||||
#define STACKSIZE 520
|
||||
#define ALPHA 296+200(SP)
|
||||
#define FZERO 304+200(SP)
|
||||
|
|
|
@ -0,0 +1,123 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "copy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL
|
||||
|
||||
/*
 * Portable fallback used when no optimized micro-kernel defined HAVE_KERNEL.
 *
 * Copies x to y in groups of eight contiguous elements, starting at index 0
 * and continuing while the group start index is below n.  Each group is read
 * completely before any of it is written.  A final partial group is not
 * handled specially: a full eight elements are always copied, so callers are
 * expected to pass a multiple of eight (the visible driver passes n & -128).
 */
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT group[8];
	BLASLONG base;
	BLASLONG k;

	for (base = 0; base < n; base += 8)
	{
		/* load the whole group first ... */
		for (k = 0; k < 8; k++)
			group[k] = x[base + k];

		/* ... then store it, keeping the load-before-store order */
		for (k = 0; k < 8; k++)
			y[base + k] = group[k];
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * Copy n elements from x (stride inc_x) to y (stride inc_y).
 *
 * Returns 0 in all cases; nothing is copied when n <= 0.
 * When both strides are 1, the largest multiple of 128 not exceeding n is
 * handed to copy_kernel and the remaining elements are copied one by one.
 * For any other stride combination a plain element-wise loop is used.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG pos;
	BLASLONG src;
	BLASLONG dst;
	BLASLONG bulk;

	if ( n <= 0 )
		return(0);

	if ( (inc_x != 1) || (inc_y != 1) )
	{
		/* strided path: walk both vectors with their own increments */
		src = 0;
		dst = 0;
		for (pos = 0; pos < n; pos++)
		{
			y[dst] = x[src];
			src += inc_x;
			dst += inc_y;
		}
		return(0);
	}

	/* contiguous path: bulk part via the kernel, tail via scalar copies */
	bulk = n & -128;
	if ( bulk > 0 )
		copy_kernel (bulk, x, y);

	/* n > 0 here, so bulk is never negative and marks where the tail starts */
	for (pos = bulk; pos < n; pos++)
		y[pos] = x[pos];

	return(0);
}
|
||||
|
||||
|
|
@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef __64BIT__
|
||||
#define STACKSIZE 340
|
||||
#define STACKSIZE 540
|
||||
#define ALPHA_SP 296+200(SP)
|
||||
#define FZERO 304+200(SP)
|
||||
|
|
|
@ -1,134 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void zcopy_kernel_32 (long n, double *x, double *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
"addi %2, %2, 512 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
|
||||
"addi %3, %3, 512 \n\t"
|
||||
"addi %2, %2, 512 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3"
|
||||
:
|
||||
"=m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
"m" (*x)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
|
||||
);
|
||||
}
|
|
@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zcopy_microk_power10.c"
|
||||
#include "copy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
#ifndef HAVE_KERNEL
|
||||
|
||||
static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG i=0;
|
||||
|
@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
zcopy_kernel_32(n1, x, y);
|
||||
copy_kernel(n1, x, y);
|
||||
i=n1;
|
||||
ix=n1*2;
|
||||
iy=n1*2;
|
||||
|
|
|
@ -513,7 +513,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT al
|
|||
|
||||
#endif
|
||||
|
||||
static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
|
||||
static __attribute__((always_inline)) inline void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
|
||||
BLASLONG i;
|
||||
for (i = 0; i < n; i++) {
|
||||
*dest = *src;
|
||||
|
|
|
@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
|
@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
|
|
@ -120,7 +120,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
|
|
|
@ -104,7 +104,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
|
|
|
@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
|
@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
|
|
@ -67,8 +67,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -84,8 +84,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -91,6 +91,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
@ -155,6 +156,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -89,8 +89,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -88,6 +88,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -126,4 +126,5 @@ int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_di
|
|||
}
|
||||
src1 += src_inc;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -105,9 +105,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"r" (alpha) // 8
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
@ -182,11 +181,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"r" (ap[1]), // 5
|
||||
"r" (alpha) // 6
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6",
|
||||
"%xmm8",
|
||||
"%xmm12", "%xmm13",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
}
|
||||
|
|
|
@ -140,7 +140,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
@ -235,9 +235,11 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"r" (ap[3]), // 7
|
||||
"r" (alpha) // 8
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -117,7 +117,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -67,7 +67,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
|
|
|
@ -86,7 +86,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
|
@ -147,7 +148,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (y), // 3
|
||||
"r" (alpha) // 4
|
||||
: "cc",
|
||||
"%xmm0",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
|
|
|
@ -87,8 +87,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -90,8 +90,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"r" (y), // 3
|
||||
"r" (dot) // 4
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
#if defined(SKYLAKEX) || defined (COOPERLAKE)
|
||||
/* the direct sgemm code written by Arjan van der Ven */
|
||||
#include <immintrin.h>
|
||||
#include "common.h"
|
||||
|
||||
#if defined(SKYLAKEX) || defined (COOPERLAKE)
|
||||
/*
|
||||
* "Direct sgemm" code. This code operates directly on the inputs and outputs
|
||||
* of the sgemm call, avoiding the copies, memory realignments and threading,
|
||||
|
|
|
@ -164,11 +164,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
@ -286,9 +284,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"r" (ap[3]), // 7
|
||||
"r" (alpha) // 8
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -138,7 +138,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
|
|
@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
|
@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
|
|
@ -120,7 +120,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
|
|
|
@ -108,9 +108,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
return;
|
||||
|
@ -185,9 +186,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
|
|
@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
|
@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"r" (alpha), // 4
|
||||
"r" (mvec) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
|
|
@ -71,7 +71,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp,
|
|||
goto exit_level_0;
|
||||
}
|
||||
liwork = iwork_query;
|
||||
lcwork = LAPACK_C2INT(cwork_query);
|
||||
lcwork = LAPACK_Z2INT(cwork_query);
|
||||
lrwork = (lapack_int)rwork_query;
|
||||
/* Allocate memory for work arrays */
|
||||
iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
|
||||
|
|
Loading…
Reference in New Issue