Merge pull request #1747 from xianyi/develop
Merge develop into 0.3.x for 0.3.3
This commit is contained in:
commit
422a8fa953
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||||
project(OpenBLAS C ASM)
|
project(OpenBLAS C ASM)
|
||||||
set(OpenBLAS_MAJOR_VERSION 0)
|
set(OpenBLAS_MAJOR_VERSION 0)
|
||||||
set(OpenBLAS_MINOR_VERSION 3)
|
set(OpenBLAS_MINOR_VERSION 3)
|
||||||
set(OpenBLAS_PATCH_VERSION 2)
|
set(OpenBLAS_PATCH_VERSION 3.dev)
|
||||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||||
|
|
||||||
# Adhere to GNU filesystem layout conventions
|
# Adhere to GNU filesystem layout conventions
|
||||||
|
@ -150,6 +150,7 @@ endif()
|
||||||
|
|
||||||
# add objects to the openblas lib
|
# add objects to the openblas lib
|
||||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||||
|
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
|
||||||
|
|
||||||
# Android needs to explicitly link against libm
|
# Android needs to explicitly link against libm
|
||||||
if(ANDROID)
|
if(ANDROID)
|
||||||
|
@ -169,6 +170,7 @@ endif()
|
||||||
# Set output for libopenblas
|
# Set output for libopenblas
|
||||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||||
|
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
|
||||||
|
|
||||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||||
|
|
111
Changelog.txt
111
Changelog.txt
|
@ -1,4 +1,115 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.2
|
||||||
|
30-Jul-2018
|
||||||
|
|
||||||
|
common:
|
||||||
|
* fixes for regressions caused by the rewrite of the thread
|
||||||
|
initialization code in 0.3.1
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* fixed cpu autodetection for the BSDs
|
||||||
|
|
||||||
|
MIPS64:
|
||||||
|
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* added autodetection of AMD Ryzen 2
|
||||||
|
* fixed build with older versions of MSVC
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.1
|
||||||
|
01-Jul-2018
|
||||||
|
|
||||||
|
common:
|
||||||
|
* rewritten thread initialization code with significantly reduced overhead
|
||||||
|
* added CBLAS interfaces to the IxAMIN BLAS extension functions
|
||||||
|
* fixed the lapack-test target
|
||||||
|
* CMAKE builds now create an OpenBLASConfig.cmake file
|
||||||
|
* ZAXPY now uses a single thread for small input sizes
|
||||||
|
* the LAPACK code was updated from Reference-LAPACK/lapack#253
|
||||||
|
(fixing LAPACKE interfaces to Aasen's functions)
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* corrected CROT and ZROT behaviour with zero INC_X
|
||||||
|
|
||||||
|
ARMV7:
|
||||||
|
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
|
||||||
|
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
|
||||||
|
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
|
||||||
|
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
|
||||||
|
specify the list of x86_64 targets to include. Any target not on the list will be supported
|
||||||
|
by the Sandybridge or Nehalem kernels if available, or by Prescott.
|
||||||
|
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
|
||||||
|
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
|
||||||
|
* added autodetection of Intel Cannon Lake series as Skylake X
|
||||||
|
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
|
||||||
|
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
|
||||||
|
recent mingw from MSYS2
|
||||||
|
* fixed a link error in mixed clang/gfortran builds with OpenMP
|
||||||
|
* updated the OSX deployment target to 10.8
|
||||||
|
* switched on parallel make for builds on MS Windows by default
|
||||||
|
|
||||||
|
x86:
|
||||||
|
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.0
|
||||||
|
23-May-2108
|
||||||
|
|
||||||
|
common:
|
||||||
|
* fixed some more thread race and locking bugs
|
||||||
|
* added preliminary support for calling an OpenMP build of the library from multiple threads
|
||||||
|
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
|
||||||
|
* general code cleanup
|
||||||
|
* optimized DSDOT implementation
|
||||||
|
* improved thread distribution for GEMM
|
||||||
|
* corrected IMATCOPY/OMATCOPY implementation
|
||||||
|
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
|
||||||
|
* cmake build improvements
|
||||||
|
* pkgconfig file now contains build options
|
||||||
|
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
|
||||||
|
* corrections and improvements for systems with more than 64 cpus
|
||||||
|
* LAPACK code updated to 3.8.0 including later fixes
|
||||||
|
* added ReLAPACK, a recursive implementation of several LAPACK functions
|
||||||
|
* Rewrote ROTMG to handle cases that the netlib code failed to address
|
||||||
|
* Disabled (broken) multithreading code for xTRMV
|
||||||
|
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
|
||||||
|
* shared memory access failures on startup are now handled more gracefully
|
||||||
|
* restored utests from earlier releases (and made them pass on all affected systems)
|
||||||
|
|
||||||
|
SPARC:
|
||||||
|
* several fixes for cpu autodetection
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* corrected vector register overwriting in several Power8 kernels
|
||||||
|
* optimized additional BLAS functions
|
||||||
|
|
||||||
|
ARM:
|
||||||
|
* added support for CortexA53 and A72
|
||||||
|
* added autodetection for ThunderX2T99
|
||||||
|
* made most optimized kernels the default for generic ARMv8 targets
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* parallelized DDOT kernel for Haswell
|
||||||
|
* changed alignment directives in assembly kernels to boost performance on OSX
|
||||||
|
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
|
||||||
|
* added support for building on OpenBSD and Dragonfly
|
||||||
|
* updated compiler options to work with Intel release 2018
|
||||||
|
* support fully optimized build with clang/flang on Microsoft Windows
|
||||||
|
* fixed building on AIX
|
||||||
|
|
||||||
|
IBM Z:
|
||||||
|
* added optimized BLAS 1/2 functions
|
||||||
|
|
||||||
|
MIPS:
|
||||||
|
* fixed cpu autodetection helper code
|
||||||
|
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
|
||||||
|
* added mips64 I6500 cpu
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.2.20
|
Version 0.2.20
|
||||||
24-Jul-2017
|
24-Jul-2017
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -97,7 +97,7 @@ endif
|
||||||
|
|
||||||
shared :
|
shared :
|
||||||
ifndef NO_SHARED
|
ifndef NO_SHARED
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||||
@$(MAKE) -C exports so
|
@$(MAKE) -C exports so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
|
@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
ifdef SMP
|
ifdef SMP
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
else ifeq ($(OSNAME), Haiku)
|
||||||
|
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
else
|
else
|
||||||
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -66,7 +66,7 @@ endif
|
||||||
#for install shared library
|
#for install shared library
|
||||||
ifndef NO_SHARED
|
ifndef NO_SHARED
|
||||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.3.2
|
VERSION = 0.3.3.dev
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||||
# BUILD_RELAPACK = 1
|
# BUILD_RELAPACK = 1
|
||||||
|
|
||||||
# If you want to use legacy threaded Level 3 implementation.
|
# If you want to use legacy threaded Level 3 implementation.
|
||||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||||
|
|
||||||
|
# If you want to use the new, still somewhat experimental code that uses
|
||||||
|
# thread-local storage instead of a central memory buffer in memory.c
|
||||||
|
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
|
||||||
|
# for this to work.
|
||||||
|
USE_TLS = 1
|
||||||
|
|
||||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||||
# compiler supports this. It's safe to keep comment it out if you
|
# compiler supports this. It's safe to keep comment it out if you
|
||||||
|
|
|
@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef USE_TLS
|
||||||
|
CCOMMON_OPT += -DUSE_TLS
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef SYMBOLPREFIX
|
ifndef SYMBOLPREFIX
|
||||||
SYMBOLPREFIX =
|
SYMBOLPREFIX =
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
|
||||||
ifndef NO_AVX512
|
ifndef NO_AVX512
|
||||||
CCOMMON_OPT += -march=skylake-avx512
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
FCOMMON_OPT += -march=skylake-avx512
|
FCOMMON_OPT += -march=skylake-avx512
|
||||||
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
|
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||||
|
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||||
|
@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
||||||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||||
Clang 3.0 will generate the wrong AVX binary code.
|
Clang 3.0 will generate the wrong AVX binary code.
|
||||||
|
* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels.
|
||||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||||
the library with `BIGNUMA=1`.
|
the library with `BIGNUMA=1`.
|
||||||
|
|
|
@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *x, *y;
|
FLOAT *a, *x, *y;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
FLOAT beta [] = {1.0, 1.0};
|
FLOAT beta [] = {1.0, 0.0};
|
||||||
char trans='N';
|
char trans='N';
|
||||||
blasint m, i, j;
|
blasint m, i, j;
|
||||||
blasint inc_x=1,inc_y=1;
|
blasint inc_x=1,inc_y=1;
|
||||||
|
|
4
c_check
4
c_check
|
@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
|
||||||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||||
$os = Android if ($data =~ /OS_ANDROID/);
|
$os = Android if ($data =~ /OS_ANDROID/);
|
||||||
|
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||||
|
|
||||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||||
|
@ -223,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/;
|
||||||
$need_fu = $1;
|
$need_fu = $1;
|
||||||
|
|
||||||
$cross = 0;
|
$cross = 0;
|
||||||
$cross = 1 if ($os ne $hostos);
|
|
||||||
|
|
||||||
if ($architecture ne $hostarch) {
|
if ($architecture ne $hostarch) {
|
||||||
$cross = 1;
|
$cross = 1;
|
||||||
|
@ -231,6 +231,8 @@ if ($architecture ne $hostarch) {
|
||||||
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$cross = 1 if ($os ne $hostos);
|
||||||
|
|
||||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||||
|
|
||||||
$linker_L = "";
|
$linker_L = "";
|
||||||
|
|
|
@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (USE_TLS)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
|
||||||
|
endif ()
|
||||||
|
|
||||||
# Only for development
|
# Only for development
|
||||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
|
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
|
||||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
|
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
|
||||||
|
|
10
common.h
10
common.h
|
@ -105,6 +105,10 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef OS_HAIKU
|
||||||
|
#define NO_SYSV_IPC
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef OS_WINDOWS
|
#ifdef OS_WINDOWS
|
||||||
#ifdef ATOM
|
#ifdef ATOM
|
||||||
#define GOTO_ATOM ATOM
|
#define GOTO_ATOM ATOM
|
||||||
|
@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
|
||||||
|
|
||||||
#ifdef USE64BITINT
|
#ifdef USE64BITINT
|
||||||
typedef BLASLONG blasint;
|
typedef BLASLONG blasint;
|
||||||
|
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
||||||
|
#define blasabs(x) llabs(x)
|
||||||
|
#else
|
||||||
|
#define blasabs(x) labs(x)
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
typedef int blasint;
|
typedef int blasint;
|
||||||
|
#define blasabs(x) abs(x)
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#ifdef USE64BITINT
|
#ifdef USE64BITINT
|
||||||
|
|
|
@ -29,15 +29,18 @@
|
||||||
|
|
||||||
#define CPU_GENERIC 0
|
#define CPU_GENERIC 0
|
||||||
#define CPU_Z13 1
|
#define CPU_Z13 1
|
||||||
|
#define CPU_Z14 2
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"ZARCH_GENERIC",
|
"ZARCH_GENERIC",
|
||||||
"Z13"
|
"Z13",
|
||||||
|
"Z14"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *cpuname_lower[] = {
|
static char *cpuname_lower[] = {
|
||||||
"zarch_generic",
|
"zarch_generic",
|
||||||
"z13"
|
"z13",
|
||||||
|
"z14"
|
||||||
};
|
};
|
||||||
|
|
||||||
int detect(void)
|
int detect(void)
|
||||||
|
@ -62,6 +65,10 @@ int detect(void)
|
||||||
if (strstr(p, "2964")) return CPU_Z13;
|
if (strstr(p, "2964")) return CPU_Z13;
|
||||||
if (strstr(p, "2965")) return CPU_Z13;
|
if (strstr(p, "2965")) return CPU_Z13;
|
||||||
|
|
||||||
|
/* detect z14, but fall back to z13 */
|
||||||
|
if (strstr(p, "3906")) return CPU_Z13;
|
||||||
|
if (strstr(p, "3907")) return CPU_Z13;
|
||||||
|
|
||||||
return CPU_GENERIC;
|
return CPU_GENERIC;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -107,5 +114,9 @@ void get_cpuconfig(void)
|
||||||
printf("#define Z13\n");
|
printf("#define Z13\n");
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
break;
|
break;
|
||||||
|
case CPU_Z14:
|
||||||
|
printf("#define Z14\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
4
ctest.c
4
ctest.c
|
@ -101,6 +101,10 @@ OS_INTERIX
|
||||||
OS_LINUX
|
OS_LINUX
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__HAIKU__)
|
||||||
|
OS_HAIKU
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__i386) || defined(_X86)
|
#if defined(__i386) || defined(_X86)
|
||||||
ARCH_X86
|
ARCH_X86
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
|
||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -122,7 +122,7 @@ endif
|
||||||
dllinit.$(SUFFIX) : dllinit.c
|
dllinit.$(SUFFIX) : dllinit.c
|
||||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||||
|
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||||
|
|
||||||
so : ../$(LIBSONAME)
|
so : ../$(LIBSONAME)
|
||||||
|
|
||||||
|
|
|
@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (trans) lenx = m;
|
if (trans) lenx = m;
|
||||||
if (trans) leny = n;
|
if (trans) leny = n;
|
||||||
|
|
||||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if (alpha == ZERO) return;
|
if (alpha == ZERO) return;
|
||||||
|
|
||||||
|
|
|
@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (trans) lenx = m;
|
if (trans) lenx = m;
|
||||||
if (trans) leny = n;
|
if (trans) leny = n;
|
||||||
|
|
||||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if (alpha == ZERO) return;
|
if (alpha == ZERO) return;
|
||||||
|
|
||||||
|
|
|
@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||||
long double s;
|
long double s;
|
||||||
long double r, roe, z;
|
long double r, roe, z;
|
||||||
|
|
||||||
long double ada = fabs(da);
|
long double ada = fabsl(da);
|
||||||
long double adb = fabs(db);
|
long double adb = fabsl(db);
|
||||||
long double scale = ada + adb;
|
long double scale = ada + adb;
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
|
@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
if (n == 0) return;
|
if (n == 0) return;
|
||||||
|
|
||||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if (alpha == ZERO) return;
|
if (alpha == ZERO) return;
|
||||||
|
|
||||||
|
|
|
@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
if (n == 0) return;
|
if (n == 0) return;
|
||||||
|
|
||||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if (alpha == ZERO) return;
|
if (alpha == ZERO) return;
|
||||||
|
|
||||||
|
|
|
@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
||||||
|
|
||||||
if (n == 0) return;
|
if (n == 0) return;
|
||||||
|
|
||||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if (alpha == ZERO) return;
|
if (alpha == ZERO) return;
|
||||||
|
|
||||||
|
|
|
@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (trans & 1) lenx = m;
|
if (trans & 1) lenx = m;
|
||||||
if (trans & 1) leny = n;
|
if (trans & 1) leny = n;
|
||||||
|
|
||||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||||
|
|
||||||
|
|
|
@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (trans & 1) lenx = m;
|
if (trans & 1) lenx = m;
|
||||||
if (trans & 1) leny = n;
|
if (trans & 1) leny = n;
|
||||||
|
|
||||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||||
|
|
||||||
|
|
|
@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
if (n == 0) return;
|
if (n == 0) return;
|
||||||
|
|
||||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||||
|
|
||||||
|
|
|
@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
|
||||||
|
|
||||||
if (n == 0) return;
|
if (n == 0) return;
|
||||||
|
|
||||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||||
|
|
||||||
|
|
|
@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
if (n == 0) return;
|
if (n == 0) return;
|
||||||
|
|
||||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||||
long double db_i = *(DB + 1);
|
long double db_i = *(DB + 1);
|
||||||
long double r;
|
long double r;
|
||||||
|
|
||||||
long double ada = fabs(da_r) + fabs(da_i);
|
long double ada = fabsl(da_r) + fabsl(da_i);
|
||||||
|
|
||||||
PRINT_DEBUG_NAME;
|
PRINT_DEBUG_NAME;
|
||||||
|
|
||||||
|
|
|
@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
|
||||||
|
|
||||||
if (n == 0) return;
|
if (n == 0) return;
|
||||||
|
|
||||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
|
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);
|
||||||
|
|
||||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), Z13)
|
ifeq ($(ARCH), zarch)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "daxpy_microk_steamroller-2.c"
|
#include "daxpy_microk_steamroller-2.c"
|
||||||
#elif defined(PILEDRIVER)
|
#elif defined(PILEDRIVER)
|
||||||
#include "daxpy_microk_piledriver-2.c"
|
#include "daxpy_microk_piledriver-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
#elif defined(HASWELL) || defined(ZEN)
|
||||||
#include "daxpy_microk_haswell-2.c"
|
#include "daxpy_microk_haswell-2.c"
|
||||||
|
#elif defined (SKYLAKEX)
|
||||||
|
#include "daxpy_microk_skylakex-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "daxpy_microk_sandy-2.c"
|
#include "daxpy_microk_sandy-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,71 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/* need a new enough GCC for avx512 support */
|
||||||
|
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_8 1
|
||||||
|
|
||||||
|
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
{
|
||||||
|
BLASLONG i = 0;
|
||||||
|
|
||||||
|
__m256d __alpha;
|
||||||
|
|
||||||
|
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
BLASLONG n32;
|
||||||
|
__m512d __alpha5;
|
||||||
|
__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||||
|
|
||||||
|
n32 = n & ~31;
|
||||||
|
|
||||||
|
for (; i < n32; i+= 32) {
|
||||||
|
_mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0]));
|
||||||
|
_mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + __alpha5 * _mm512_loadu_pd(&x[i + 8]));
|
||||||
|
_mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16]));
|
||||||
|
_mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24]));
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (; i < n; i+= 16) {
|
||||||
|
_mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0]));
|
||||||
|
_mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4]));
|
||||||
|
_mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8]));
|
||||||
|
_mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#include "daxpy_microk_haswell-2.c"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "ddot_microk_piledriver-2.c"
|
#include "ddot_microk_piledriver-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "ddot_microk_nehalem-2.c"
|
#include "ddot_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
#elif defined(HASWELL) || defined(ZEN)
|
||||||
#include "ddot_microk_haswell-2.c"
|
#include "ddot_microk_haswell-2.c"
|
||||||
|
#elif defined (SKYLAKEX)
|
||||||
|
#include "ddot_microk_skylakex-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "ddot_microk_sandy-2.c"
|
#include "ddot_microk_sandy-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,96 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/* need a new enough GCC for avx512 support */
|
||||||
|
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_8 1
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
__m256d accum_0, accum_1, accum_2, accum_3;
|
||||||
|
|
||||||
|
accum_0 = _mm256_setzero_pd();
|
||||||
|
accum_1 = _mm256_setzero_pd();
|
||||||
|
accum_2 = _mm256_setzero_pd();
|
||||||
|
accum_3 = _mm256_setzero_pd();
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
__m512d accum_05, accum_15, accum_25, accum_35;
|
||||||
|
int n32;
|
||||||
|
n32 = n & (~31);
|
||||||
|
|
||||||
|
accum_05 = _mm512_setzero_pd();
|
||||||
|
accum_15 = _mm512_setzero_pd();
|
||||||
|
accum_25 = _mm512_setzero_pd();
|
||||||
|
accum_35 = _mm512_setzero_pd();
|
||||||
|
|
||||||
|
for (; i < n32; i += 32) {
|
||||||
|
accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]);
|
||||||
|
accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]);
|
||||||
|
accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]);
|
||||||
|
accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||||
|
* below can continue using the intermediate results in its loop
|
||||||
|
*/
|
||||||
|
accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1);
|
||||||
|
accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1);
|
||||||
|
accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1);
|
||||||
|
accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
for (; i < n; i += 16) {
|
||||||
|
accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
|
||||||
|
accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
|
||||||
|
accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
|
||||||
|
accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
|
||||||
|
|
||||||
|
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
|
||||||
|
|
||||||
|
__m128d half_accum0;
|
||||||
|
|
||||||
|
/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
|
||||||
|
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
|
||||||
|
|
||||||
|
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||||
|
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
|
||||||
|
|
||||||
|
*dot = half_accum0[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
#include "ddot_microk_haswell-2.c"
|
||||||
|
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(NEHALEM)
|
#if defined(NEHALEM)
|
||||||
#include "dgemv_n_microk_nehalem-4.c"
|
#include "dgemv_n_microk_nehalem-4.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
|
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dgemv_n_microk_haswell-4.c"
|
#include "dgemv_n_microk_haswell-4.c"
|
||||||
|
#elif defined (SKYLAKEX)
|
||||||
|
#include "dgemv_n_microk_skylakex-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,126 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/* need a new enough GCC for avx512 support */
|
||||||
|
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x4 1
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
{
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
__m256d x0, x1, x2, x3;
|
||||||
|
__m256d __alpha;
|
||||||
|
|
||||||
|
x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||||
|
x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||||
|
x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2]));
|
||||||
|
x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3]));
|
||||||
|
|
||||||
|
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
int n5;
|
||||||
|
__m512d x05, x15, x25, x35;
|
||||||
|
__m512d __alpha5;
|
||||||
|
n5 = n & ~7;
|
||||||
|
|
||||||
|
x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||||
|
x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||||
|
x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2]));
|
||||||
|
x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3]));
|
||||||
|
|
||||||
|
__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||||
|
|
||||||
|
for (; i < n5; i+= 8) {
|
||||||
|
__m512d tempY;
|
||||||
|
__m512d sum;
|
||||||
|
|
||||||
|
sum = _mm512_loadu_pd(&ap[0][i]) * x05 +
|
||||||
|
_mm512_loadu_pd(&ap[1][i]) * x15 +
|
||||||
|
_mm512_loadu_pd(&ap[2][i]) * x25 +
|
||||||
|
_mm512_loadu_pd(&ap[3][i]) * x35;
|
||||||
|
|
||||||
|
tempY = _mm512_loadu_pd(&y[i]);
|
||||||
|
tempY += sum * __alpha5;
|
||||||
|
_mm512_storeu_pd(&y[i], tempY);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (; i < n; i+= 4) {
|
||||||
|
__m256d tempY;
|
||||||
|
__m256d sum;
|
||||||
|
|
||||||
|
sum = _mm256_loadu_pd(&ap[0][i]) * x0 +
|
||||||
|
_mm256_loadu_pd(&ap[1][i]) * x1 +
|
||||||
|
_mm256_loadu_pd(&ap[2][i]) * x2 +
|
||||||
|
_mm256_loadu_pd(&ap[3][i]) * x3;
|
||||||
|
|
||||||
|
tempY = _mm256_loadu_pd(&y[i]);
|
||||||
|
tempY += sum * __alpha;
|
||||||
|
_mm256_storeu_pd(&y[i], tempY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x2
|
||||||
|
|
||||||
|
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
{
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
__m256d x0, x1;
|
||||||
|
__m256d __alpha;
|
||||||
|
|
||||||
|
x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||||
|
x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||||
|
|
||||||
|
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||||
|
|
||||||
|
|
||||||
|
for (i = 0; i < n; i+= 4) {
|
||||||
|
__m256d tempY;
|
||||||
|
__m256d sum;
|
||||||
|
|
||||||
|
sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1;
|
||||||
|
|
||||||
|
tempY = _mm256_loadu_pd(&y[i]);
|
||||||
|
tempY += sum * __alpha;
|
||||||
|
_mm256_storeu_pd(&y[i], tempY);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
#include "dgemv_n_microk_haswell-4.c"
|
||||||
|
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "dscal_microk_bulldozer-2.c"
|
#include "dscal_microk_bulldozer-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "dscal_microk_sandy-2.c"
|
#include "dscal_microk_sandy-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
#elif defined(HASWELL) || defined(ZEN)
|
||||||
#include "dscal_microk_haswell-2.c"
|
#include "dscal_microk_haswell-2.c"
|
||||||
|
#elif defined (SKYLAKEX)
|
||||||
|
#include "dscal_microk_skylakex-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,77 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014-2015, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/* need a new enough GCC for avx512 support */
|
||||||
|
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_8 1
|
||||||
|
|
||||||
|
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
__m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||||
|
for (; i < n; i += 8) {
|
||||||
|
_mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0]));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
__m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||||
|
for (; i < n; i += 8) {
|
||||||
|
_mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0]));
|
||||||
|
_mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4]));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
/* question to self: Why is this not just memset() */
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
__m512d zero = _mm512_setzero_pd();
|
||||||
|
for (; i < n; i += 8) {
|
||||||
|
_mm512_storeu_pd(&x[i], zero);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
__m256d zero = _mm256_setzero_pd();
|
||||||
|
for (; i < n; i += 8) {
|
||||||
|
_mm256_storeu_pd(&x[i + 0], zero);
|
||||||
|
_mm256_storeu_pd(&x[i + 4], zero);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
#include "dscal_microk_haswell-2.c"
|
||||||
|
#endif
|
|
@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dsymv_L_microk_bulldozer-2.c"
|
#include "dsymv_L_microk_bulldozer-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
#elif defined(HASWELL) || defined(ZEN)
|
||||||
#include "dsymv_L_microk_haswell-2.c"
|
#include "dsymv_L_microk_haswell-2.c"
|
||||||
|
#elif defined (SKYLAKEX)
|
||||||
|
#include "dsymv_L_microk_skylakex-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "dsymv_L_microk_sandy-2.c"
|
#include "dsymv_L_microk_sandy-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
|
|
|
@ -0,0 +1,161 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/* need a new enough GCC for avx512 support */
|
||||||
|
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x4 1
|
||||||
|
|
||||||
|
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
__m256d accum_0, accum_1, accum_2, accum_3;
|
||||||
|
__m256d temp1_0, temp1_1, temp1_2, temp1_3;
|
||||||
|
|
||||||
|
/* the 256 bit wide acculmulator vectors start out as zero */
|
||||||
|
accum_0 = _mm256_setzero_pd();
|
||||||
|
accum_1 = _mm256_setzero_pd();
|
||||||
|
accum_2 = _mm256_setzero_pd();
|
||||||
|
accum_3 = _mm256_setzero_pd();
|
||||||
|
|
||||||
|
temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0]));
|
||||||
|
temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1]));
|
||||||
|
temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2]));
|
||||||
|
temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3]));
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
__m512d accum_05, accum_15, accum_25, accum_35;
|
||||||
|
__m512d temp1_05, temp1_15, temp1_25, temp1_35;
|
||||||
|
BLASLONG to2;
|
||||||
|
int delta;
|
||||||
|
|
||||||
|
/* the 512 bit wide accumulator vectors start out as zero */
|
||||||
|
accum_05 = _mm512_setzero_pd();
|
||||||
|
accum_15 = _mm512_setzero_pd();
|
||||||
|
accum_25 = _mm512_setzero_pd();
|
||||||
|
accum_35 = _mm512_setzero_pd();
|
||||||
|
|
||||||
|
temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0]));
|
||||||
|
temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1]));
|
||||||
|
temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2]));
|
||||||
|
temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3]));
|
||||||
|
|
||||||
|
delta = (to - from) & ~7;
|
||||||
|
to2 = from + delta;
|
||||||
|
|
||||||
|
|
||||||
|
for (; from < to2; from += 8) {
|
||||||
|
__m512d _x, _y;
|
||||||
|
__m512d a0, a1, a2, a3;
|
||||||
|
|
||||||
|
_y = _mm512_loadu_pd(&y[from]);
|
||||||
|
_x = _mm512_loadu_pd(&x[from]);
|
||||||
|
|
||||||
|
a0 = _mm512_loadu_pd(&a[0][from]);
|
||||||
|
a1 = _mm512_loadu_pd(&a[1][from]);
|
||||||
|
a2 = _mm512_loadu_pd(&a[2][from]);
|
||||||
|
a3 = _mm512_loadu_pd(&a[3][from]);
|
||||||
|
|
||||||
|
_y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3;
|
||||||
|
|
||||||
|
accum_05 += _x * a0;
|
||||||
|
accum_15 += _x * a1;
|
||||||
|
accum_25 += _x * a2;
|
||||||
|
accum_35 += _x * a3;
|
||||||
|
|
||||||
|
_mm512_storeu_pd(&y[from], _y);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||||
|
* below can continue using the intermediate results in its loop
|
||||||
|
*/
|
||||||
|
accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1));
|
||||||
|
accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1));
|
||||||
|
accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1));
|
||||||
|
accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1));
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (; from != to; from += 4) {
|
||||||
|
__m256d _x, _y;
|
||||||
|
__m256d a0, a1, a2, a3;
|
||||||
|
|
||||||
|
_y = _mm256_loadu_pd(&y[from]);
|
||||||
|
_x = _mm256_loadu_pd(&x[from]);
|
||||||
|
|
||||||
|
/* load 4 rows of matrix data */
|
||||||
|
a0 = _mm256_loadu_pd(&a[0][from]);
|
||||||
|
a1 = _mm256_loadu_pd(&a[1][from]);
|
||||||
|
a2 = _mm256_loadu_pd(&a[2][from]);
|
||||||
|
a3 = _mm256_loadu_pd(&a[3][from]);
|
||||||
|
|
||||||
|
_y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3;
|
||||||
|
|
||||||
|
accum_0 += _x * a0;
|
||||||
|
accum_1 += _x * a1;
|
||||||
|
accum_2 += _x * a2;
|
||||||
|
accum_3 += _x * a3;
|
||||||
|
|
||||||
|
_mm256_storeu_pd(&y[from], _y);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2
|
||||||
|
* output array. There is no direct instruction for this in 256 bit space, only in 128 space.
|
||||||
|
*/
|
||||||
|
|
||||||
|
__m128d half_accum0, half_accum1, half_accum2, half_accum3;
|
||||||
|
|
||||||
|
|
||||||
|
/* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */
|
||||||
|
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
|
||||||
|
half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1));
|
||||||
|
half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1));
|
||||||
|
half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1));
|
||||||
|
|
||||||
|
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||||
|
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
|
||||||
|
half_accum1 = _mm_hadd_pd(half_accum1, half_accum1);
|
||||||
|
half_accum2 = _mm_hadd_pd(half_accum2, half_accum2);
|
||||||
|
half_accum3 = _mm_hadd_pd(half_accum3, half_accum3);
|
||||||
|
|
||||||
|
/* and store the lowest double value from each of these vectors in the temp2 output */
|
||||||
|
temp2[0] += half_accum0[0];
|
||||||
|
temp2[1] += half_accum1[0];
|
||||||
|
temp2[2] += half_accum2[0];
|
||||||
|
temp2[3] += half_accum3[0];
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#include "dsymv_L_microk_haswell-2.c"
|
||||||
|
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(NEHALEM)
|
#if defined(NEHALEM)
|
||||||
#include "saxpy_microk_nehalem-2.c"
|
#include "saxpy_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
#elif defined(HASWELL) || defined(ZEN)
|
||||||
#include "saxpy_microk_haswell-2.c"
|
#include "saxpy_microk_haswell-2.c"
|
||||||
|
#elif defined (SKYLAKEX)
|
||||||
|
#include "saxpy_microk_skylakex-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "saxpy_microk_sandy-2.c"
|
#include "saxpy_microk_sandy-2.c"
|
||||||
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/* need a new enough GCC for avx512 support */
|
||||||
|
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_16 1
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
{
|
||||||
|
BLASLONG i = 0;
|
||||||
|
|
||||||
|
__m256 __alpha;
|
||||||
|
|
||||||
|
__alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha));
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
BLASLONG n64;
|
||||||
|
__m512 __alpha5;
|
||||||
|
__alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha));
|
||||||
|
|
||||||
|
n64 = n & ~63;
|
||||||
|
|
||||||
|
for (; i < n64; i+= 64) {
|
||||||
|
_mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0]));
|
||||||
|
_mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16]));
|
||||||
|
_mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32]));
|
||||||
|
_mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48]));
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (; i < n; i+= 32) {
|
||||||
|
_mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0]));
|
||||||
|
_mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8]));
|
||||||
|
_mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16]));
|
||||||
|
_mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#include "saxpy_microk_haswell-2.c"
|
||||||
|
#endif
|
||||||
|
|
|
@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "sdot_microk_steamroller-2.c"
|
#include "sdot_microk_steamroller-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "sdot_microk_nehalem-2.c"
|
#include "sdot_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
#elif defined(HASWELL) || defined(ZEN)
|
||||||
#include "sdot_microk_haswell-2.c"
|
#include "sdot_microk_haswell-2.c"
|
||||||
|
#elif defined (SKYLAKEX)
|
||||||
|
#include "sdot_microk_skylakex-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "sdot_microk_sandy-2.c"
|
#include "sdot_microk_sandy-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,98 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/* need a new enough GCC for avx512 support */
|
||||||
|
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_16 1
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
__m256 accum_0, accum_1, accum_2, accum_3;
|
||||||
|
|
||||||
|
accum_0 = _mm256_setzero_ps();
|
||||||
|
accum_1 = _mm256_setzero_ps();
|
||||||
|
accum_2 = _mm256_setzero_ps();
|
||||||
|
accum_3 = _mm256_setzero_ps();
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
__m512 accum_05, accum_15, accum_25, accum_35;
|
||||||
|
int n64;
|
||||||
|
n64 = n & (~63);
|
||||||
|
|
||||||
|
accum_05 = _mm512_setzero_ps();
|
||||||
|
accum_15 = _mm512_setzero_ps();
|
||||||
|
accum_25 = _mm512_setzero_ps();
|
||||||
|
accum_35 = _mm512_setzero_ps();
|
||||||
|
|
||||||
|
for (; i < n64; i += 64) {
|
||||||
|
accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]);
|
||||||
|
accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]);
|
||||||
|
accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]);
|
||||||
|
accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||||
|
* below can continue using the intermediate results in its loop
|
||||||
|
*/
|
||||||
|
accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1);
|
||||||
|
accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1);
|
||||||
|
accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1);
|
||||||
|
accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
for (; i < n; i += 32) {
|
||||||
|
accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]);
|
||||||
|
accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]);
|
||||||
|
accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]);
|
||||||
|
accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
|
||||||
|
|
||||||
|
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
|
||||||
|
|
||||||
|
__m128 half_accum0;
|
||||||
|
|
||||||
|
/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
|
||||||
|
half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1);
|
||||||
|
|
||||||
|
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||||
|
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
|
||||||
|
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
|
||||||
|
|
||||||
|
*dot = half_accum0[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
#include "sdot_microk_haswell-2.c"
|
||||||
|
#endif
|
|
@ -280,8 +280,8 @@
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
INTEGER ILAENV
|
INTEGER ILAENV2STAGE
|
||||||
EXTERNAL LSAME, ILAENV
|
EXTERNAL LSAME, ILAENV2STAGE
|
||||||
* ..
|
* ..
|
||||||
* .. Executable Statements ..
|
* .. Executable Statements ..
|
||||||
*
|
*
|
||||||
|
@ -297,9 +297,9 @@
|
||||||
*
|
*
|
||||||
* Determine the block size, the workspace size and the hous size.
|
* Determine the block size, the workspace size and the hous size.
|
||||||
*
|
*
|
||||||
IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||||
LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||||
LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||||
*
|
*
|
||||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||||
INFO = -1
|
INFO = -1
|
||||||
|
|
|
@ -285,8 +285,8 @@
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
INTEGER ILAENV
|
INTEGER ILAENV2STAGE
|
||||||
EXTERNAL LSAME, ILAENV
|
EXTERNAL LSAME, ILAENV2STAGE
|
||||||
* ..
|
* ..
|
||||||
* .. Executable Statements ..
|
* .. Executable Statements ..
|
||||||
*
|
*
|
||||||
|
@ -296,7 +296,7 @@
|
||||||
INFO = 0
|
INFO = 0
|
||||||
UPPER = LSAME( UPLO, 'U' )
|
UPPER = LSAME( UPLO, 'U' )
|
||||||
LQUERY = ( LWORK.EQ.-1 )
|
LQUERY = ( LWORK.EQ.-1 )
|
||||||
LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
|
LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||||
|
|
||||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||||
INFO = -1
|
INFO = -1
|
||||||
|
|
|
@ -277,8 +277,8 @@
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
INTEGER ILAENV
|
INTEGER ILAENV2STAGE
|
||||||
EXTERNAL LSAME, ILAENV
|
EXTERNAL LSAME, ILAENV2STAGE
|
||||||
* ..
|
* ..
|
||||||
* .. Executable Statements ..
|
* .. Executable Statements ..
|
||||||
*
|
*
|
||||||
|
@ -294,9 +294,9 @@
|
||||||
*
|
*
|
||||||
* Determine the block size, the workspace size and the hous size.
|
* Determine the block size, the workspace size and the hous size.
|
||||||
*
|
*
|
||||||
IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||||
LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||||
LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||||
*
|
*
|
||||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||||
INFO = -1
|
INFO = -1
|
||||||
|
|
|
@ -285,8 +285,8 @@
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
INTEGER ILAENV
|
INTEGER ILAENV2STAGE
|
||||||
EXTERNAL LSAME, ILAENV
|
EXTERNAL LSAME, ILAENV2STAGE
|
||||||
* ..
|
* ..
|
||||||
* .. Executable Statements ..
|
* .. Executable Statements ..
|
||||||
*
|
*
|
||||||
|
@ -296,7 +296,7 @@
|
||||||
INFO = 0
|
INFO = 0
|
||||||
UPPER = LSAME( UPLO, 'U' )
|
UPPER = LSAME( UPLO, 'U' )
|
||||||
LQUERY = ( LWORK.EQ.-1 )
|
LQUERY = ( LWORK.EQ.-1 )
|
||||||
LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
|
LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||||
|
|
||||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||||
INFO = -1
|
INFO = -1
|
||||||
|
|
|
@ -277,8 +277,8 @@
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
INTEGER ILAENV
|
INTEGER ILAENV2STAGE
|
||||||
EXTERNAL LSAME, ILAENV
|
EXTERNAL LSAME, ILAENV2STAGE
|
||||||
* ..
|
* ..
|
||||||
* .. Executable Statements ..
|
* .. Executable Statements ..
|
||||||
*
|
*
|
||||||
|
@ -294,9 +294,9 @@
|
||||||
*
|
*
|
||||||
* Determine the block size, the workspace size and the hous size.
|
* Determine the block size, the workspace size and the hous size.
|
||||||
*
|
*
|
||||||
IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||||
LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||||
LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||||
*
|
*
|
||||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||||
INFO = -1
|
INFO = -1
|
||||||
|
|
|
@ -285,8 +285,8 @@
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
INTEGER ILAENV
|
INTEGER ILAENV2STAGE
|
||||||
EXTERNAL LSAME, ILAENV
|
EXTERNAL LSAME, ILAENV2STAGE
|
||||||
* ..
|
* ..
|
||||||
* .. Executable Statements ..
|
* .. Executable Statements ..
|
||||||
*
|
*
|
||||||
|
@ -296,7 +296,7 @@
|
||||||
INFO = 0
|
INFO = 0
|
||||||
UPPER = LSAME( UPLO, 'U' )
|
UPPER = LSAME( UPLO, 'U' )
|
||||||
LQUERY = ( LWORK.EQ.-1 )
|
LQUERY = ( LWORK.EQ.-1 )
|
||||||
LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
|
LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||||
|
|
||||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||||
INFO = -1
|
INFO = -1
|
||||||
|
|
|
@ -280,8 +280,8 @@
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
INTEGER ILAENV
|
INTEGER ILAENV2STAGE
|
||||||
EXTERNAL LSAME, ILAENV
|
EXTERNAL LSAME, ILAENV2STAGE
|
||||||
* ..
|
* ..
|
||||||
* .. Executable Statements ..
|
* .. Executable Statements ..
|
||||||
*
|
*
|
||||||
|
@ -297,9 +297,9 @@
|
||||||
*
|
*
|
||||||
* Determine the block size, the workspace size and the hous size.
|
* Determine the block size, the workspace size and the hous size.
|
||||||
*
|
*
|
||||||
IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||||
LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||||
LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||||
*
|
*
|
||||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||||
INFO = -1
|
INFO = -1
|
||||||
|
|
|
@ -285,8 +285,8 @@
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
INTEGER ILAENV
|
INTEGER ILAENV2STAGE
|
||||||
EXTERNAL LSAME, ILAENV
|
EXTERNAL LSAME, ILAENV2STAGE
|
||||||
* ..
|
* ..
|
||||||
* .. Executable Statements ..
|
* .. Executable Statements ..
|
||||||
*
|
*
|
||||||
|
@ -296,7 +296,7 @@
|
||||||
INFO = 0
|
INFO = 0
|
||||||
UPPER = LSAME( UPLO, 'U' )
|
UPPER = LSAME( UPLO, 'U' )
|
||||||
LQUERY = ( LWORK.EQ.-1 )
|
LQUERY = ( LWORK.EQ.-1 )
|
||||||
LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
|
LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||||
|
|
||||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||||
INFO = -1
|
INFO = -1
|
||||||
|
|
Loading…
Reference in New Issue