Merge pull request #1747 from xianyi/develop
Merge develop into 0.3.x for 0.3.3
This commit is contained in:
commit
422a8fa953
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
|||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 3.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
@ -150,6 +150,7 @@ endif()
|
|||
|
||||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if(ANDROID)
|
||||
|
@ -169,6 +170,7 @@ endif()
|
|||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
|
||||
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
|
|
111
Changelog.txt
111
Changelog.txt
|
@ -1,4 +1,115 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.2
|
||||
30-Jul-2018
|
||||
|
||||
common:
|
||||
* fixes for regressions caused by the rewrite of the thread
|
||||
initialization code in 0.3.1
|
||||
|
||||
POWER:
|
||||
* fixed cpu autodetection for the BSDs
|
||||
|
||||
MIPS64:
|
||||
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
|
||||
|
||||
x86_64:
|
||||
* added autodetection of AMD Ryzen 2
|
||||
* fixed build with older versions of MSVC
|
||||
|
||||
====================================================================
|
||||
Version 0.3.1
|
||||
01-Jul-2018
|
||||
|
||||
common:
|
||||
* rewritten thread initialization code with significantly reduced overhead
|
||||
* added CBLAS interfaces to the IxAMIN BLAS extension functions
|
||||
* fixed the lapack-test target
|
||||
* CMAKE builds now create an OpenBLASConfig.cmake file
|
||||
* ZAXPY now uses a single thread for small input sizes
|
||||
* the LAPACK code was updated from Reference-LAPACK/lapack#253
|
||||
(fixing LAPACKE interfaces to Aasen's functions)
|
||||
|
||||
POWER:
|
||||
* corrected CROT and ZROT behaviour with zero INC_X
|
||||
|
||||
ARMV7:
|
||||
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||
|
||||
x86_64:
|
||||
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
|
||||
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
|
||||
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
|
||||
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
|
||||
specify the list of x86_64 targets to include. Any target not on the list will be supported
|
||||
by the Sandybridge or Nehalem kernels if available, or by Prescott.
|
||||
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
|
||||
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
|
||||
* added autodetection of Intel Cannon Lake series as Skylake X
|
||||
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
|
||||
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
|
||||
recent mingw from MSYS2
|
||||
* fixed a link error in mixed clang/gfortran builds with OpenMP
|
||||
* updated the OSX deployment target to 10.8
|
||||
* switched on parallel make for builds on MS Windows by default
|
||||
|
||||
x86:
|
||||
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
|
||||
|
||||
====================================================================
|
||||
Version 0.3.0
|
||||
23-May-2108
|
||||
|
||||
common:
|
||||
* fixed some more thread race and locking bugs
|
||||
* added preliminary support for calling an OpenMP build of the library from multiple threads
|
||||
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
|
||||
* general code cleanup
|
||||
* optimized DSDOT implementation
|
||||
* improved thread distribution for GEMM
|
||||
* corrected IMATCOPY/OMATCOPY implementation
|
||||
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
|
||||
* cmake build improvements
|
||||
* pkgconfig file now contains build options
|
||||
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
|
||||
* corrections and improvements for systems with more than 64 cpus
|
||||
* LAPACK code updated to 3.8.0 including later fixes
|
||||
* added ReLAPACK, a recursive implementation of several LAPACK functions
|
||||
* Rewrote ROTMG to handle cases that the netlib code failed to address
|
||||
* Disabled (broken) multithreading code for xTRMV
|
||||
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
|
||||
* shared memory access failures on startup are now handled more gracefully
|
||||
* restored utests from earlier releases (and made them pass on all affected systems)
|
||||
|
||||
SPARC:
|
||||
* several fixes for cpu autodetection
|
||||
|
||||
POWER:
|
||||
* corrected vector register overwriting in several Power8 kernels
|
||||
* optimized additional BLAS functions
|
||||
|
||||
ARM:
|
||||
* added support for CortexA53 and A72
|
||||
* added autodetection for ThunderX2T99
|
||||
* made most optimized kernels the default for generic ARMv8 targets
|
||||
|
||||
x86_64:
|
||||
* parallelized DDOT kernel for Haswell
|
||||
* changed alignment directives in assembly kernels to boost performance on OSX
|
||||
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
|
||||
* added support for building on OpenBSD and Dragonfly
|
||||
* updated compiler options to work with Intel release 2018
|
||||
* support fully optimized build with clang/flang on Microsoft Windows
|
||||
* fixed building on AIX
|
||||
|
||||
IBM Z:
|
||||
* added optimized BLAS 1/2 functions
|
||||
|
||||
MIPS:
|
||||
* fixed cpu autodetection helper code
|
||||
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
|
||||
* added mips64 I6500 cpu
|
||||
|
||||
====================================================================
|
||||
Version 0.2.20
|
||||
24-Jul-2017
|
||||
|
|
4
Makefile
4
Makefile
|
@ -97,7 +97,7 @@ endif
|
|||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
|
@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
|
|||
ifdef SMP
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else ifeq ($(OSNAME), Haiku)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
|
|
|
@ -66,7 +66,7 @@ endif
|
|||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.2
|
||||
VERSION = 0.3.3.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
# BUILD_RELAPACK = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
# If you want to use the new, still somewhat experimental code that uses
|
||||
# thread-local storage instead of a central memory buffer in memory.c
|
||||
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
|
||||
# for this to work.
|
||||
USE_TLS = 1
|
||||
|
||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||
# compiler supports this. It's safe to keep comment it out if you
|
||||
|
|
|
@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
|||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
ifdef USE_TLS
|
||||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
ifndef SYMBOLPREFIX
|
||||
SYMBOLPREFIX =
|
||||
endif
|
||||
|
|
|
@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
|
|||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
|
@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
|
|||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
|
@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
|||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels.
|
||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
|
|
|
@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 0.0};
|
||||
char trans='N';
|
||||
blasint m, i, j;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
|
|
4
c_check
4
c_check
|
@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
|
|||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
|
@ -223,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/;
|
|||
$need_fu = $1;
|
||||
|
||||
$cross = 0;
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
if ($architecture ne $hostarch) {
|
||||
$cross = 1;
|
||||
|
@ -231,6 +231,8 @@ if ($architecture ne $hostarch) {
|
|||
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
||||
}
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
|
|
|
@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR)
|
|||
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
|
||||
endif ()
|
||||
|
||||
if (USE_TLS)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
|
||||
endif ()
|
||||
|
||||
# Only for development
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
|
||||
|
|
10
common.h
10
common.h
|
@ -105,6 +105,10 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#define NO_SYSV_IPC
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
#ifdef ATOM
|
||||
#define GOTO_ATOM ATOM
|
||||
|
@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
|
|||
|
||||
#ifdef USE64BITINT
|
||||
typedef BLASLONG blasint;
|
||||
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
||||
#define blasabs(x) llabs(x)
|
||||
#else
|
||||
#define blasabs(x) labs(x)
|
||||
#endif
|
||||
#else
|
||||
typedef int blasint;
|
||||
#define blasabs(x) abs(x)
|
||||
#endif
|
||||
#else
|
||||
#ifdef USE64BITINT
|
||||
|
|
|
@ -29,15 +29,18 @@
|
|||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13"
|
||||
"Z13",
|
||||
"Z14"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13"
|
||||
"z13",
|
||||
"z14"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
|
@ -62,6 +65,10 @@ int detect(void)
|
|||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
/* detect z14, but fall back to z13 */
|
||||
if (strstr(p, "3906")) return CPU_Z13;
|
||||
if (strstr(p, "3907")) return CPU_Z13;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
|
@ -107,5 +114,9 @@ void get_cpuconfig(void)
|
|||
printf("#define Z13\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
4
ctest.c
4
ctest.c
|
@ -101,6 +101,10 @@ OS_INTERIX
|
|||
OS_LINUX
|
||||
#endif
|
||||
|
||||
#if defined(__HAIKU__)
|
||||
OS_HAIKU
|
||||
#endif
|
||||
|
||||
#if defined(__i386) || defined(_X86)
|
||||
ARCH_X86
|
||||
#endif
|
||||
|
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -122,7 +122,7 @@ endif
|
|||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
|
|
|
@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (trans) lenx = m;
|
||||
if (trans) leny = n;
|
||||
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (trans) lenx = m;
|
||||
if (trans) leny = n;
|
||||
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
long double s;
|
||||
long double r, roe, z;
|
||||
|
||||
long double ada = fabs(da);
|
||||
long double adb = fabs(db);
|
||||
long double ada = fabsl(da);
|
||||
long double adb = fabsl(db);
|
||||
long double scale = ada + adb;
|
||||
|
||||
#ifndef CBLAS
|
||||
|
|
|
@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (trans & 1) lenx = m;
|
||||
if (trans & 1) leny = n;
|
||||
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||
|
||||
|
|
|
@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (trans & 1) lenx = m;
|
||||
if (trans & 1) leny = n;
|
||||
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||
|
||||
|
|
|
@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
|
|
@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
|
|
@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
long double db_i = *(DB + 1);
|
||||
long double r;
|
||||
|
||||
long double ada = fabs(da_r) + fabs(da_i);
|
||||
long double ada = fabsl(da_r) + fabsl(da_i);
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
|
|
|
@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
ifeq ($(ARCH), zarch)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "daxpy_microk_steamroller-2.c"
|
||||
#elif defined(PILEDRIVER)
|
||||
#include "daxpy_microk_piledriver-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "daxpy_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "daxpy_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "daxpy_microk_sandy-2.c"
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
|
||||
__m256d __alpha;
|
||||
|
||||
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
BLASLONG n32;
|
||||
__m512d __alpha5;
|
||||
__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
n32 = n & ~31;
|
||||
|
||||
for (; i < n32; i+= 32) {
|
||||
_mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0]));
|
||||
_mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + __alpha5 * _mm512_loadu_pd(&x[i + 8]));
|
||||
_mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16]));
|
||||
_mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24]));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
for (; i < n; i+= 16) {
|
||||
_mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0]));
|
||||
_mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4]));
|
||||
_mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8]));
|
||||
_mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
|
||||
}
|
||||
}
|
||||
#else
|
||||
#include "daxpy_microk_haswell-2.c"
|
||||
#endif
|
||||
|
||||
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "ddot_microk_piledriver-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "ddot_microk_nehalem-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "ddot_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "ddot_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "ddot_microk_sandy-2.c"
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
{
|
||||
int i = 0;
|
||||
__m256d accum_0, accum_1, accum_2, accum_3;
|
||||
|
||||
accum_0 = _mm256_setzero_pd();
|
||||
accum_1 = _mm256_setzero_pd();
|
||||
accum_2 = _mm256_setzero_pd();
|
||||
accum_3 = _mm256_setzero_pd();
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d accum_05, accum_15, accum_25, accum_35;
|
||||
int n32;
|
||||
n32 = n & (~31);
|
||||
|
||||
accum_05 = _mm512_setzero_pd();
|
||||
accum_15 = _mm512_setzero_pd();
|
||||
accum_25 = _mm512_setzero_pd();
|
||||
accum_35 = _mm512_setzero_pd();
|
||||
|
||||
for (; i < n32; i += 32) {
|
||||
accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]);
|
||||
accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]);
|
||||
accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]);
|
||||
accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]);
|
||||
}
|
||||
|
||||
/*
|
||||
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||
* below can continue using the intermediate results in its loop
|
||||
*/
|
||||
accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1);
|
||||
accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1);
|
||||
accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1);
|
||||
accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1);
|
||||
|
||||
#endif
|
||||
for (; i < n; i += 16) {
|
||||
accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
|
||||
accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
|
||||
accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
|
||||
accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
|
||||
}
|
||||
|
||||
/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
|
||||
|
||||
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
|
||||
|
||||
__m128d half_accum0;
|
||||
|
||||
/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
|
||||
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
|
||||
|
||||
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
|
||||
|
||||
*dot = half_accum0[0];
|
||||
}
|
||||
|
||||
#else
|
||||
#include "ddot_microk_haswell-2.c"
|
||||
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NEHALEM)
|
||||
#include "dgemv_n_microk_nehalem-4.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#include "dgemv_n_microk_haswell-4.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "dgemv_n_microk_skylakex-4.c"
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
int i = 0;
|
||||
|
||||
__m256d x0, x1, x2, x3;
|
||||
__m256d __alpha;
|
||||
|
||||
x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||
x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||
x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2]));
|
||||
x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3]));
|
||||
|
||||
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
int n5;
|
||||
__m512d x05, x15, x25, x35;
|
||||
__m512d __alpha5;
|
||||
n5 = n & ~7;
|
||||
|
||||
x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||
x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||
x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2]));
|
||||
x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3]));
|
||||
|
||||
__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
for (; i < n5; i+= 8) {
|
||||
__m512d tempY;
|
||||
__m512d sum;
|
||||
|
||||
sum = _mm512_loadu_pd(&ap[0][i]) * x05 +
|
||||
_mm512_loadu_pd(&ap[1][i]) * x15 +
|
||||
_mm512_loadu_pd(&ap[2][i]) * x25 +
|
||||
_mm512_loadu_pd(&ap[3][i]) * x35;
|
||||
|
||||
tempY = _mm512_loadu_pd(&y[i]);
|
||||
tempY += sum * __alpha5;
|
||||
_mm512_storeu_pd(&y[i], tempY);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < n; i+= 4) {
|
||||
__m256d tempY;
|
||||
__m256d sum;
|
||||
|
||||
sum = _mm256_loadu_pd(&ap[0][i]) * x0 +
|
||||
_mm256_loadu_pd(&ap[1][i]) * x1 +
|
||||
_mm256_loadu_pd(&ap[2][i]) * x2 +
|
||||
_mm256_loadu_pd(&ap[3][i]) * x3;
|
||||
|
||||
tempY = _mm256_loadu_pd(&y[i]);
|
||||
tempY += sum * __alpha;
|
||||
_mm256_storeu_pd(&y[i], tempY);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x2
|
||||
|
||||
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
int i = 0;
|
||||
|
||||
__m256d x0, x1;
|
||||
__m256d __alpha;
|
||||
|
||||
x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||
x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||
|
||||
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
|
||||
for (i = 0; i < n; i+= 4) {
|
||||
__m256d tempY;
|
||||
__m256d sum;
|
||||
|
||||
sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1;
|
||||
|
||||
tempY = _mm256_loadu_pd(&y[i]);
|
||||
tempY += sum * __alpha;
|
||||
_mm256_storeu_pd(&y[i], tempY);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#include "dgemv_n_microk_haswell-4.c"
|
||||
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "dscal_microk_bulldozer-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "dscal_microk_sandy-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "dscal_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "dscal_microk_skylakex-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014-2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
for (; i < n; i += 8) {
|
||||
_mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0]));
|
||||
}
|
||||
#else
|
||||
__m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
for (; i < n; i += 8) {
|
||||
_mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0]));
|
||||
_mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
/* question to self: Why is this not just memset() */
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d zero = _mm512_setzero_pd();
|
||||
for (; i < n; i += 8) {
|
||||
_mm512_storeu_pd(&x[i], zero);
|
||||
}
|
||||
#else
|
||||
__m256d zero = _mm256_setzero_pd();
|
||||
for (; i < n; i += 8) {
|
||||
_mm256_storeu_pd(&x[i + 0], zero);
|
||||
_mm256_storeu_pd(&x[i + 4], zero);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#include "dscal_microk_haswell-2.c"
|
||||
#endif
|
|
@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#include "dsymv_L_microk_bulldozer-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "dsymv_L_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "dsymv_L_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "dsymv_L_microk_sandy-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
|
|
|
@ -0,0 +1,161 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
|
||||
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||
{
|
||||
|
||||
|
||||
__m256d accum_0, accum_1, accum_2, accum_3;
|
||||
__m256d temp1_0, temp1_1, temp1_2, temp1_3;
|
||||
|
||||
/* the 256 bit wide acculmulator vectors start out as zero */
|
||||
accum_0 = _mm256_setzero_pd();
|
||||
accum_1 = _mm256_setzero_pd();
|
||||
accum_2 = _mm256_setzero_pd();
|
||||
accum_3 = _mm256_setzero_pd();
|
||||
|
||||
temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0]));
|
||||
temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1]));
|
||||
temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2]));
|
||||
temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3]));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d accum_05, accum_15, accum_25, accum_35;
|
||||
__m512d temp1_05, temp1_15, temp1_25, temp1_35;
|
||||
BLASLONG to2;
|
||||
int delta;
|
||||
|
||||
/* the 512 bit wide accumulator vectors start out as zero */
|
||||
accum_05 = _mm512_setzero_pd();
|
||||
accum_15 = _mm512_setzero_pd();
|
||||
accum_25 = _mm512_setzero_pd();
|
||||
accum_35 = _mm512_setzero_pd();
|
||||
|
||||
temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0]));
|
||||
temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1]));
|
||||
temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2]));
|
||||
temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3]));
|
||||
|
||||
delta = (to - from) & ~7;
|
||||
to2 = from + delta;
|
||||
|
||||
|
||||
for (; from < to2; from += 8) {
|
||||
__m512d _x, _y;
|
||||
__m512d a0, a1, a2, a3;
|
||||
|
||||
_y = _mm512_loadu_pd(&y[from]);
|
||||
_x = _mm512_loadu_pd(&x[from]);
|
||||
|
||||
a0 = _mm512_loadu_pd(&a[0][from]);
|
||||
a1 = _mm512_loadu_pd(&a[1][from]);
|
||||
a2 = _mm512_loadu_pd(&a[2][from]);
|
||||
a3 = _mm512_loadu_pd(&a[3][from]);
|
||||
|
||||
_y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3;
|
||||
|
||||
accum_05 += _x * a0;
|
||||
accum_15 += _x * a1;
|
||||
accum_25 += _x * a2;
|
||||
accum_35 += _x * a3;
|
||||
|
||||
_mm512_storeu_pd(&y[from], _y);
|
||||
|
||||
};
|
||||
|
||||
/*
|
||||
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||
* below can continue using the intermediate results in its loop
|
||||
*/
|
||||
accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1));
|
||||
accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1));
|
||||
accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1));
|
||||
accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1));
|
||||
|
||||
#endif
|
||||
|
||||
for (; from != to; from += 4) {
|
||||
__m256d _x, _y;
|
||||
__m256d a0, a1, a2, a3;
|
||||
|
||||
_y = _mm256_loadu_pd(&y[from]);
|
||||
_x = _mm256_loadu_pd(&x[from]);
|
||||
|
||||
/* load 4 rows of matrix data */
|
||||
a0 = _mm256_loadu_pd(&a[0][from]);
|
||||
a1 = _mm256_loadu_pd(&a[1][from]);
|
||||
a2 = _mm256_loadu_pd(&a[2][from]);
|
||||
a3 = _mm256_loadu_pd(&a[3][from]);
|
||||
|
||||
_y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3;
|
||||
|
||||
accum_0 += _x * a0;
|
||||
accum_1 += _x * a1;
|
||||
accum_2 += _x * a2;
|
||||
accum_3 += _x * a3;
|
||||
|
||||
_mm256_storeu_pd(&y[from], _y);
|
||||
|
||||
};
|
||||
|
||||
/*
|
||||
* we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2
|
||||
* output array. There is no direct instruction for this in 256 bit space, only in 128 space.
|
||||
*/
|
||||
|
||||
__m128d half_accum0, half_accum1, half_accum2, half_accum3;
|
||||
|
||||
|
||||
/* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */
|
||||
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
|
||||
half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1));
|
||||
half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1));
|
||||
half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1));
|
||||
|
||||
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
|
||||
half_accum1 = _mm_hadd_pd(half_accum1, half_accum1);
|
||||
half_accum2 = _mm_hadd_pd(half_accum2, half_accum2);
|
||||
half_accum3 = _mm_hadd_pd(half_accum3, half_accum3);
|
||||
|
||||
/* and store the lowest double value from each of these vectors in the temp2 output */
|
||||
temp2[0] += half_accum0[0];
|
||||
temp2[1] += half_accum1[0];
|
||||
temp2[2] += half_accum2[0];
|
||||
temp2[3] += half_accum3[0];
|
||||
}
|
||||
#else
|
||||
#include "dsymv_L_microk_haswell-2.c"
|
||||
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NEHALEM)
|
||||
#include "saxpy_microk_nehalem-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "saxpy_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "saxpy_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "saxpy_microk_sandy-2.c"
|
||||
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
|
||||
__m256 __alpha;
|
||||
|
||||
__alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
BLASLONG n64;
|
||||
__m512 __alpha5;
|
||||
__alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha));
|
||||
|
||||
n64 = n & ~63;
|
||||
|
||||
for (; i < n64; i+= 64) {
|
||||
_mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0]));
|
||||
_mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16]));
|
||||
_mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32]));
|
||||
_mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48]));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
for (; i < n; i+= 32) {
|
||||
_mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0]));
|
||||
_mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8]));
|
||||
_mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16]));
|
||||
_mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24]));
|
||||
}
|
||||
}
|
||||
#else
|
||||
#include "saxpy_microk_haswell-2.c"
|
||||
#endif
|
||||
|
|
@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "sdot_microk_steamroller-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "sdot_microk_nehalem-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "sdot_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "sdot_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "sdot_microk_sandy-2.c"
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,98 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
|
||||
{
|
||||
int i = 0;
|
||||
__m256 accum_0, accum_1, accum_2, accum_3;
|
||||
|
||||
accum_0 = _mm256_setzero_ps();
|
||||
accum_1 = _mm256_setzero_ps();
|
||||
accum_2 = _mm256_setzero_ps();
|
||||
accum_3 = _mm256_setzero_ps();
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512 accum_05, accum_15, accum_25, accum_35;
|
||||
int n64;
|
||||
n64 = n & (~63);
|
||||
|
||||
accum_05 = _mm512_setzero_ps();
|
||||
accum_15 = _mm512_setzero_ps();
|
||||
accum_25 = _mm512_setzero_ps();
|
||||
accum_35 = _mm512_setzero_ps();
|
||||
|
||||
for (; i < n64; i += 64) {
|
||||
accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]);
|
||||
accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]);
|
||||
accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]);
|
||||
accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]);
|
||||
}
|
||||
|
||||
/*
|
||||
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||
* below can continue using the intermediate results in its loop
|
||||
*/
|
||||
accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1);
|
||||
accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1);
|
||||
accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1);
|
||||
accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1);
|
||||
|
||||
#endif
|
||||
for (; i < n; i += 32) {
|
||||
accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]);
|
||||
accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]);
|
||||
accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]);
|
||||
accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]);
|
||||
}
|
||||
|
||||
/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
|
||||
|
||||
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
|
||||
|
||||
__m128 half_accum0;
|
||||
|
||||
/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
|
||||
half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1);
|
||||
|
||||
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
|
||||
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
|
||||
|
||||
*dot = half_accum0[0];
|
||||
}
|
||||
|
||||
#else
|
||||
#include "sdot_microk_haswell-2.c"
|
||||
#endif
|
|
@ -280,8 +280,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -297,9 +297,9 @@
|
|||
*
|
||||
* Determine the block size, the workspace size and the hous size.
|
||||
*
|
||||
IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
*
|
||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -285,8 +285,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -296,7 +296,7 @@
|
|||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -277,8 +277,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -294,9 +294,9 @@
|
|||
*
|
||||
* Determine the block size, the workspace size and the hous size.
|
||||
*
|
||||
IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
*
|
||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -285,8 +285,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -296,7 +296,7 @@
|
|||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -277,8 +277,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -294,9 +294,9 @@
|
|||
*
|
||||
* Determine the block size, the workspace size and the hous size.
|
||||
*
|
||||
IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
*
|
||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -285,8 +285,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -296,7 +296,7 @@
|
|||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -280,8 +280,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -297,9 +297,9 @@
|
|||
*
|
||||
* Determine the block size, the workspace size and the hous size.
|
||||
*
|
||||
IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
*
|
||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -285,8 +285,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -296,7 +296,7 @@
|
|||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
Loading…
Reference in New Issue