Merge pull request #2943 from xianyi/develop

Merge from develop for 0.3.12 release
This commit is contained in:
Martin Kroeker 2020-10-24 12:52:59 +02:00 committed by GitHub
commit 6e3a05f2c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
53 changed files with 540 additions and 269 deletions

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 11)
set(OpenBLAS_PATCH_VERSION 12)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions

View File

@ -1,4 +1,31 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.12
24-Oct-2020
common:
* Fixed missibg LAPACK functions (inadvertently dropped during
the build system restructuring)
* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
POWER:
* Added optimized SCOPY/CCOPY kernels for POWER10
* Increased and unified the default size of the GEMM BUFFER
* Fixed building for POWER1ß in DYNAMIC_ARCH mode
* POWER10 compatibility test now checks binutils version as well
* Cleaned up compiler warnings
x86_64:
* corrected compiler version checks for AVX2 compatibility
* added compiler option -mavx2 for building with flang
* fixed direct SGEMM pathway for small matrix sizes (broken by
the code refactoring in 0.3.11)
* fixed unhandled partial register clobbers in several kernels
for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
ARMV8:
* improved Apple Vortex support to include cross-compiling
====================================================================
Version 0.3.11
17-Oct-2020
@ -28,7 +55,7 @@ Version 0.3.11
* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
enabling these options
* Fixed detection of gfortran when invoked through an mpi wrapper
* Improve thread reinitialization performance with OpenMP xafter a fork
* Improve thread reinitialization performance with OpenMP after a fork
* Added support for building only the subset of the library required
for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
* Optional function name prefixes and suffixes are now correctly
@ -66,7 +93,6 @@ ARMV8:
* Fixed cpu detection on BSD-like systems
* Fixed compilation in -std=C18 mode
IBM Z:
* Added support for compiling with the clang compiler
* Improved GEMM performance on Z14

View File

@ -10,7 +10,7 @@ USE_OPENMP = 1
endif
ifeq ($(CORE), POWER10)
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
endif

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.11
VERSION = 0.3.12
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -295,10 +295,13 @@ COMMON_PROF = -pg
# the below is not yet configurable, use cmake if you need to build only select types
BUILD_SINGLE = 1
BUILD_DOUBLE = 1
BUILD_COMPLEX = 1
BUILD_COMPLEX16 = 1
# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
# the functions for complex numbers, uncomment the desired type(s) below
# BUILD_SINGLE = 1
# BUILD_DOUBLE = 1
# BUILD_COMPLEX = 1
# BUILD_COMPLEX16 = 1
#
# End of user configuration
#

View File

@ -641,6 +641,7 @@ DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9
DYNAMIC_CORE += POWER10
CCOMMON_OPT += -DHAVE_P10_SUPPORT
endif
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1)
@ -648,11 +649,14 @@ DYNAMIC_CORE += POWER9
else
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif
ifeq ($(GCCVERSIONGTEQ11), 1)
LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35)
ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
DYNAMIC_CORE += POWER10
CCOMMON_OPT += -DHAVE_P10_SUPPORT
else ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
DYNAMIC_CORE += POWER10
CCOMMON_OPT += -DHAVE_P10_SUPPORT
endif
else
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)

View File

@ -74,8 +74,10 @@ ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
CCOMMON_OPT += -mavx2
endif
else
@ -86,8 +88,14 @@ endif
ifeq ($(F_COMPILER), GFORTRAN)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
FCOMMON_OPT += -mavx2
endif
else
ifeq ($(F_COMPILER), FLANG)
FCOMMON_OPT += -mavx2
endif
endif

View File

@ -49,6 +49,7 @@ if (DYNAMIC_ARCH)
if (POWER)
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
endif ()
if (X86)

View File

@ -416,6 +416,29 @@ endif ()
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "VORTEX")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t4\n"
"#define L2_SIZE\t5262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "POWER6")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"

View File

@ -844,8 +844,8 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8) || defined(POWER9) || defined(POWER10)
#define BUFFER_SIZE ( 64 << 20)
#elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
#define BUFFER_SIZE ( 64 << 22)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif

View File

@ -424,7 +424,7 @@ void get_cpuconfig(void)
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
printf("#define L1_DATA_SIZE %d \n",value);
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
printf("#define L2_DATA_SIZE %d \n",value);
printf("#define L2_SIZE %d \n",value);
break;
#endif
}

View File

@ -6,10 +6,10 @@ extern gotoblas_t gotoblas_POWER8;
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
extern gotoblas_t gotoblas_POWER9;
#endif
#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
|| (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
#define HAVE_P10_SUPPORT 1
#endif
//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
//#define HAVE_P10_SUPPORT 1
//#endif
#ifdef HAVE_P10_SUPPORT
extern gotoblas_t gotoblas_POWER10;
#endif

View File

@ -50,8 +50,8 @@
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
zgeadd, dzsum);
@cblasobjs = (lsame, xerbla);
@halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@blasobjs = (lsame, xerbla);
@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@cblasobjsc = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
@ -72,7 +72,7 @@
);
@cblasobjss = (
cblas_sasum, cblas_saxpy,
cblas_sasum, cblas_saxpy, cblas_saxpby,
cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm,
cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg,
cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr,
@ -94,7 +94,7 @@
@cblasobjs = ( cblas_xerbla );
@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
@exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@ -415,7 +415,7 @@ zpotri,
cgeqrt, cgeqrt2, cgeqrt3, cgemqrt,
ctpqrt, ctpqrt2, ctpmqrt, ctprfb,
);
@lapack2objszc = (
@lapackobjs2zc = (
# ZCLASRC -- Double-single mixed precision complex routines called from
# single, single-extra and double precision complex LAPACK
# routines (i.e. from CLASRC, CXLASRC, ZLASRC).
@ -425,7 +425,7 @@ zpotri,
cpotrs,
);
@lapack2objsd = (
@lapackobjs2d = (
# DLASRC -- Double precision real LAPACK routines
# already provided by @lapackobjs:
# dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri,
@ -568,7 +568,7 @@ zpotri,
);
# functions added for lapack-3.6.0
@lapack2objsc = ( @lapack2objsc,
@lapackobjs2c = ( @lapackobjs2c,
cgejsv,
cgesvdx,
cgesvj,
@ -604,7 +604,7 @@ zpotri,
csyr2,
cunm22,
);
@lapackobjs2d = (@lapack2objsd,
@lapackobjs2d = (@lapackobjs2d,
dbdsvdx,
dgesvdx,
dgetrf2,
@ -637,7 +637,7 @@ zpotri,
dpotrf2,
dsecnd,
);
@lapack2objss = (@lapack2objss,
@lapackobjs2s = (@lapackobjs2s,
sbdsvdx,
second,
sgesvdx,
@ -670,7 +670,7 @@ zpotri,
sorm22,
spotrf2,
);
@lapack2objsz = (@lapack2objsz,
@lapackobjs2z = (@lapackobjs2z,
zgejsv,
zgesvdx,
zgesvj,
@ -707,7 +707,7 @@ zpotri,
zunm22,
);
# functions added for lapack-3.7.0
@lapack2objss = (@lapack2objss,
@lapackobjs2s = (@lapackobjs2s,
slarfy,
strevc3,
sgelqt,
@ -726,7 +726,7 @@ zpotri,
stplqt2,
stpmlqt,
);
@lapack2objsd = (@lapack2objsd,
@lapackobjs2d = (@lapackobjs2d,
dlarfy,
dsyconvf,
dtrevc3,
@ -746,7 +746,7 @@ zpotri,
dtplqt2,
dtpmlqt,
);
@lapack2objsc = (@lapack2objsc,
@lapackobjs2c = (@lapackobjs2c,
clarfy,
csyconvf,
ctrevc3,
@ -766,7 +766,7 @@ zpotri,
ctplqt2,
ctpmlqt,
);
@lapack2objsz = (@lapack2objsz,
@lapackobjs2z = (@lapackobjs2z,
zlarfy,
zsyconvf,
ztrevc3,
@ -786,31 +786,31 @@ zpotri,
zlamswlq,
zgemlq,
);
@lapack2objs = (@lapack2objs,
sladiv1,
dladiv1,
@lapackobjs2s = (@lapackobjs2s,
sladiv1);
@lapackobjs2d = (@lapackobjs2d,
dladiv1);
@lapackobjs = (@lapackobjs,
iparam2stage,
# functions added for lapack-3.8.0
ilaenv2stage,
);
# functions added for lapack-3.9.0
@lapack2objsc = (@lapack2objsc,
@lapackobjs2c = (@lapackobjs2c,
cgesvdq,
cungtsqr,
dcombssq,
cungtsqr
);
@lapack2objsd = (@lapack2objsd,
@lapackobjs2d = (@lapackobjs2d,
dcombssq,
dgesvdq,
dorgtsqr,
);
@lapack2objss = (@lapack2objss,
@lapackobjs2s = (@lapackobjs2s,
scombssq,
sgesvdq,
sorgtsqr,
);
@lapack2objsz = (@lapack2objsz,
@lapackobjs2z = (@lapackobjs2z,
zgesvdq,
zungtsqr
);
@ -835,8 +835,27 @@ zpotri,
dlatzm, dtzrqf);
@lapack_deprecated_objss = (
sgelsx,
sgegs,
sgegv,
sgeqpf,
sggsvd,
sggsvp,
slahrd,
slatzm,
stzrqf
);
@lapack_deprecated_objsz = (
zgegs,
zgegv,
zgelsx,
zgeqpf,
zggsvd,
zggsvp,
zlahrd,
zlatzm,
ztzrqf
);
@lapacke_deprecated_objsc = (
@ -3590,14 +3609,18 @@ use File::Basename;
my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");
if ($ARGV[12] == 1) {
@blasobjs = (@blasobjs, @halfblasobjs);
@cblasobjs = (@cblasobjs, @halfcblasobjs);
@blasobjs = (@blasobjs, @bfblasobjs);
@cblasobjs = (@cblasobjs, @bfcblasobjs);
}
if ($ARGV[13] == 1) {
@blasobjs = (@blasobjs, @blasobjss);
@cblasobjs = (@cblasobjs, @cblasobjss);
@lapackobjs = (@lapackobjs, @lapackobjss);
@lapack2objs = (@lapack2objs, @lapack2objss);
@lapackobjs2 = (@lapackobjs2, @lapackobjs2s);
@lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
@lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objss);
@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objss);
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s);
@lapackeobjs = (@lapackeobjs, @lapackeobjss);
}
@ -3605,7 +3628,12 @@ if ($ARGV[14] == 1) {
@blasobjs = (@blasobjs, @blasobjsd);
@cblasobjs = (@cblasobjs, @cblasobjsd);
@lapackobjs = (@lapackobjs, @lapackobjsd);
@lapack2objs = (@lapack2objs, @lapack2objsd);
if ($ARGV[13] == 0) {
@lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
}
@lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz);
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsd);
@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsd);
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d);
@lapackeobjs = (@lapackeobjs, @lapackeobjsd);
}
@ -3613,9 +3641,14 @@ if ($ARGV[15] == 1) {
@blasobjs = (@blasobjs, @blasobjsc);
@cblasobjs = (@cblasobjs, @cblasobjsc);
@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc);
@cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsc);
@cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc);
@lapackobjs = (@lapackobjs, @lapackobjsc);
@lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc);
@lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc);
if ($ARGV[13] == 0) {
@lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
}
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc);
@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsc);
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c);
@lapackeobjs = (@lapackeobjs, @lapackeobjsc);
}
@ -3623,9 +3656,17 @@ if ($ARGV[16] == 1) {
@blasobjs = (@blasobjs, @blasobjsz);
@cblasobjs = (@cblasobjs, @cblasobjsz);
@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz);
@cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsz);
@cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz);
@lapackobjs = (@lapackobjs, @lapackobjsz);
@lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc);
@lapackobjs2 = (@lapackobjs2, @lapackobjs2z);
if ($ARGV[15] == 0) {
@lapackobjs2 = (@lapackobjs2, @lapackobjs2zc);
}
if ($ARGV[14] == 0) {
@lapackobjs2 = (@lapackobjs2, @lapackobjs2dz);
}
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz);
@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsz);
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_z);
@lapackeobjs = (@lapackeobjs, @lapackeobjsz);
}

View File

@ -1222,6 +1222,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_VORTEX
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "VORTEX"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DVORTEX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "vortex"
#define CORENAME "VORTEX"
#endif
#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"

View File

@ -22,20 +22,25 @@ ifeq ($(C_COMPILER), CLANG)
override CFLAGS += -fno-integrated-as
endif
endif
AVX2OPT =
ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
AVX2OPT = -mavx2
endif
endif
ifeq ($(C_COMPILER), CLANG)
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
AVX2OPT = -mavx2
endif
endif

View File

@ -150,9 +150,9 @@ CAXPYKERNEL = caxpy.c
endif
ZAXPYKERNEL = zaxpy_power10.c
#
SCOPYKERNEL = scopy.c
SCOPYKERNEL = scopy_power10.c
DCOPYKERNEL = dcopy_power10.c
CCOPYKERNEL = ccopy.c
CCOPYKERNEL = ccopy_power10.c
ZCOPYKERNEL = zcopy_power10.c
#
SDOTKERNEL = sdot.c

View File

@ -0,0 +1,132 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "copy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL
static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i=0;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
FLOAT *x1=x;
FLOAT *y1=y;
while ( i<n )
{
f0 = x1[0];
f1 = x1[1];
f2 = x1[2];
f3 = x1[3];
f4 = x1[4];
f5 = x1[5];
f6 = x1[6];
f7 = x1[7];
y1[0] = f0;
y1[1] = f1;
y1[2] = f2;
y1[3] = f3;
y1[4] = f4;
y1[5] = f5;
y1[6] = f6;
y1[7] = f7;
x1 += 8;
y1 += 8;
i+=4;
}
return;
}
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
if ( n <= 0 ) return(0);
if ( (inc_x == 1) && (inc_y == 1 ))
{
BLASLONG n1 = n & -64;
if ( n1 > 0 )
{
copy_kernel(n1, x, y);
i=n1;
ix=n1*2;
iy=n1*2;
}
while(i < n)
{
y[iy] = x[iy] ;
y[iy+1] = x[ix+1] ;
ix+=2;
iy+=2;
i++ ;
}
}
else
{
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;
while(i < n)
{
y[iy] = x[ix] ;
y[iy+1] = x[ix+1] ;
ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
}
}
return(0);
}

View File

@ -25,9 +25,9 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_64 1
#define HAVE_KERNEL 1
static void dcopy_kernel_64 (long n, double *x, double *y)
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__
(
@ -49,8 +49,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y)
"lxvp 60, 448(%2) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %2, %2, 512 \n\t"
#if !defined(COMPLEX) && !defined(DOUBLE)
"addic. %1, %1, -128 \n\t"
#elif defined(COMPLEX) && defined(DOUBLE)
"addic. %1, %1, -32 \n\t"
#else
"addic. %1, %1, -64 \n\t"
#endif
"ble two%= \n\t"
".align 5 \n"
@ -94,7 +99,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y)
"addi %3, %3, 512 \n\t"
"addi %2, %2, 512 \n\t"
#if !defined(COMPLEX) && !defined(DOUBLE)
"addic. %1, %1, -128 \n\t"
#elif defined(COMPLEX) && defined(DOUBLE)
"addic. %1, %1, -32 \n\t"
#else
"addic. %1, %1, -64 \n\t"
#endif
"bgt one%= \n"
"two%=: \n\t"

View File

@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __64BIT__
#define STACKSIZE 400
#define STACKSIZE 592
#define ALPHA_R_SP 304+192(SP)
#define ALPHA_I_SP 312+192(SP)
#else
#define STACKSIZE 256
#define STACKSIZE 452
#define ALPHA_R_SP 224+196(SP)
#define ALPHA_I_SP 232+196(SP)

View File

@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dcopy_microk_power10.c"
#include "copy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL_64
#ifndef HAVE_KERNEL
static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i=0;
@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -64;
if ( n1 > 0 )
{
dcopy_kernel_64(n1, x, y);
copy_kernel(n1, x, y);
i=n1;
}

View File

@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __64BIT__
#define STACKSIZE 320
#define STACKSIZE 512
#define ALPHA_SP 296+192(SP)
#define FZERO 304+192(SP)
#else
#define STACKSIZE 240
#define STACKSIZE 440
#define ALPHA_SP 224+200(SP)
#define FZERO 232+200(SP)

View File

@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __64BIT__
#define STACKSIZE 320
#define STACKSIZE 520
#define ALPHA_SP 296+200(SP)
#define FZERO 304+200(SP)

View File

@ -47,7 +47,6 @@
#endif
#ifdef __64BIT__
#define STACKSIZE 320
#define STACKSIZE 520
#define ALPHA 296+200(SP)
#define FZERO 304+200(SP)

View File

@ -0,0 +1,123 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "copy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i=0;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
FLOAT *x1=x;
FLOAT *y1=y;
while ( i<n )
{
f0 = x1[0];
f1 = x1[1];
f2 = x1[2];
f3 = x1[3];
f4 = x1[4];
f5 = x1[5];
f6 = x1[6];
f7 = x1[7];
y1[0] = f0;
y1[1] = f1;
y1[2] = f2;
y1[3] = f3;
y1[4] = f4;
y1[5] = f5;
y1[6] = f6;
y1[7] = f7;
x1 += 8;
y1 += 8;
i+=8;
}
return;
}
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
if ( n <= 0 ) return(0);
if ( (inc_x == 1) && (inc_y == 1 ))
{
BLASLONG n1 = n & -128;
if ( n1 > 0 )
{
copy_kernel (n1, x, y);
i=n1;
}
while(i < n)
{
y[i] = x[i] ;
i++ ;
}
}
else
{
while(i < n)
{
y[iy] = x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;
}
}
return(0);
}

View File

@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __64BIT__
#define STACKSIZE 340
#define STACKSIZE 540
#define ALPHA_SP 296+200(SP)
#define FZERO 304+200(SP)

View File

@ -1,134 +0,0 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_32 1
static void zcopy_kernel_32 (long n, double *x, double *y)
{
__asm__
(
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
"lxvp 48, 256(%2) \n\t"
"lxvp 50, 288(%2) \n\t"
"lxvp 52, 320(%2) \n\t"
"lxvp 54, 352(%2) \n\t"
"lxvp 56, 384(%2) \n\t"
"lxvp 58, 416(%2) \n\t"
"lxvp 60, 448(%2) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %2, %2, 512 \n\t"
"addic. %1, %1, -32 \n\t"
"ble two%= \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"lxvp 32, 0(%2) \n\t"
"stxvp 34, 32(%3) \n\t"
"lxvp 34, 32(%2) \n\t"
"stxvp 36, 64(%3) \n\t"
"lxvp 36, 64(%2) \n\t"
"stxvp 38, 96(%3) \n\t"
"lxvp 38, 96(%2) \n\t"
"stxvp 40, 128(%3) \n\t"
"lxvp 40, 128(%2) \n\t"
"stxvp 42, 160(%3) \n\t"
"lxvp 42, 160(%2) \n\t"
"stxvp 44, 192(%3) \n\t"
"lxvp 44, 192(%2) \n\t"
"stxvp 46, 224(%3) \n\t"
"lxvp 46, 224(%2) \n\t"
"stxvp 48, 256(%3) \n\t"
"lxvp 48, 256(%2) \n\t"
"stxvp 50, 288(%3) \n\t"
"lxvp 50, 288(%2) \n\t"
"stxvp 52, 320(%3) \n\t"
"lxvp 52, 320(%2) \n\t"
"stxvp 54, 352(%3) \n\t"
"lxvp 54, 352(%2) \n\t"
"stxvp 56, 384(%3) \n\t"
"lxvp 56, 384(%2) \n\t"
"stxvp 58, 416(%3) \n\t"
"lxvp 58, 416(%2) \n\t"
"stxvp 60, 448(%3) \n\t"
"lxvp 60, 448(%2) \n\t"
"stxvp 62, 480(%3) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %3, %3, 512 \n\t"
"addi %2, %2, 512 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt one%= \n"
"two%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"stxvp 34, 32(%3) \n\t"
"stxvp 36, 64(%3) \n\t"
"stxvp 38, 96(%3) \n\t"
"stxvp 40, 128(%3) \n\t"
"stxvp 42, 160(%3) \n\t"
"stxvp 44, 192(%3) \n\t"
"stxvp 46, 224(%3) \n\t"
"stxvp 48, 256(%3) \n\t"
"stxvp 50, 288(%3) \n\t"
"stxvp 52, 320(%3) \n\t"
"stxvp 54, 352(%3) \n\t"
"stxvp 56, 384(%3) \n\t"
"stxvp 58, 416(%3) \n\t"
"stxvp 60, 448(%3) \n\t"
"stxvp 62, 480(%3) \n\t"
"#n=%1 x=%4=%2 y=%0=%3"
:
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
"m" (*x)
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
);
}

View File

@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "zcopy_microk_power10.c"
#include "copy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL_32
#ifndef HAVE_KERNEL
static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i=0;
@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
zcopy_kernel_32(n1, x, y);
copy_kernel(n1, x, y);
i=n1;
ix=n1*2;
iy=n1*2;

View File

@ -513,7 +513,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT al
#endif
static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
static __attribute__((always_inline)) inline void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
BLASLONG i;
for (i = 0; i < n; i++) {
*dest = *src;

View File

@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -120,7 +120,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",

View File

@ -104,7 +104,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",

View File

@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -67,7 +67,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"

View File

@ -84,8 +84,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3
"r" (dot) // 4
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -91,6 +91,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
@ -155,6 +156,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -89,8 +89,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3
"r" (dot) // 4
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -88,6 +88,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -126,4 +126,5 @@ int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_di
}
src1 += src_inc;
}
return 0;
}

View File

@ -105,9 +105,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
@ -182,11 +181,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[1]), // 5
"r" (alpha) // 6
: "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5",
"%xmm6",
"%xmm8",
"%xmm12", "%xmm13",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}

View File

@ -140,7 +140,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
@ -235,9 +235,11 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -117,7 +117,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -67,7 +67,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"

View File

@ -86,7 +86,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
@ -147,7 +148,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3
"r" (alpha) // 4
: "cc",
"%xmm0",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"

View File

@ -87,8 +87,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3
"r" (dot) // 4
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -90,8 +90,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3
"r" (dot) // 4
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -1,7 +1,8 @@
#if defined(SKYLAKEX) || defined (COOPERLAKE)
/* the direct sgemm code written by Arjan van der Ven */
#include <immintrin.h>
#include "common.h"
#if defined(SKYLAKEX) || defined (COOPERLAKE)
/*
* "Direct sgemm" code. This code operates directly on the inputs and outputs
* of the sgemm call, avoiding the copies, memory realignments and threading,

View File

@ -164,11 +164,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
@ -286,9 +284,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -138,7 +138,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -120,7 +120,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",

View File

@ -108,9 +108,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
return;
@ -185,9 +186,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@ -71,7 +71,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp,
goto exit_level_0;
}
liwork = iwork_query;
lcwork = LAPACK_C2INT(cwork_query);
lcwork = LAPACK_Z2INT(cwork_query);
lrwork = (lapack_int)rwork_query;
/* Allocate memory for work arrays */
iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );