commit
ec0cac1669
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
|||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 1.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 4.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
@ -150,6 +150,7 @@ endif()
|
|||
|
||||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if(ANDROID)
|
||||
|
@ -169,6 +170,7 @@ endif()
|
|||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
|
||||
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
|
|
138
Changelog.txt
138
Changelog.txt
|
@ -1,4 +1,142 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.3
|
||||
31-Aug-2018
|
||||
|
||||
common:
|
||||
* thread memory allocation has been switched back to the method
|
||||
used before version 0.3.1 due to unexpected problems caused by
|
||||
the new code under some circumstances. A new compile-time option
|
||||
USE_TLS has been added to enable the new code, and it is hoped
|
||||
that this can become the default again in the next version.
|
||||
* LAPAck PR272 has been integrated, which fixes spurious errors
|
||||
in DSYEVR and related functions caused by missing conversion
|
||||
from ILAENV to ILAENV_2STAGE in several _2stage routines.
|
||||
* the cmake-generated OpenBLASConfig.cmake now uses correct case
|
||||
for the name of the library
|
||||
* added support for Haiku OS
|
||||
|
||||
x86_64:
|
||||
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
|
||||
DSCAL, DGEMVN and DSYMVL
|
||||
* added a workaround for a cygwin issue that prevented compilation
|
||||
of AVX512 code
|
||||
|
||||
IBM Z:
|
||||
* added autodetection of Z14
|
||||
* fixed TRMM errors in the generic target
|
||||
|
||||
====================================================================
|
||||
Version 0.3.2
|
||||
30-Jul-2018
|
||||
|
||||
common:
|
||||
* fixes for regressions caused by the rewrite of the thread
|
||||
initialization code in 0.3.1
|
||||
|
||||
POWER:
|
||||
* fixed cpu autodetection for the BSDs
|
||||
|
||||
MIPS64:
|
||||
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
|
||||
|
||||
x86_64:
|
||||
* added autodetection of AMD Ryzen 2
|
||||
* fixed build with older versions of MSVC
|
||||
|
||||
====================================================================
|
||||
Version 0.3.1
|
||||
01-Jul-2018
|
||||
|
||||
common:
|
||||
* rewritten thread initialization code with significantly reduced overhead
|
||||
* added CBLAS interfaces to the IxAMIN BLAS extension functions
|
||||
* fixed the lapack-test target
|
||||
* CMAKE builds now create an OpenBLASConfig.cmake file
|
||||
* ZAXPY now uses a single thread for small input sizes
|
||||
* the LAPACK code was updated from Reference-LAPACK/lapack#253
|
||||
(fixing LAPACKE interfaces to Aasen's functions)
|
||||
|
||||
POWER:
|
||||
* corrected CROT and ZROT behaviour with zero INC_X
|
||||
|
||||
ARMV7:
|
||||
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||
|
||||
x86_64:
|
||||
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
|
||||
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
|
||||
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
|
||||
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
|
||||
specify the list of x86_64 targets to include. Any target not on the list will be supported
|
||||
by the Sandybridge or Nehalem kernels if available, or by Prescott.
|
||||
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
|
||||
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
|
||||
* added autodetection of Intel Cannon Lake series as Skylake X
|
||||
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
|
||||
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
|
||||
recent mingw from MSYS2
|
||||
* fixed a link error in mixed clang/gfortran builds with OpenMP
|
||||
* updated the OSX deployment target to 10.8
|
||||
* switched on parallel make for builds on MS Windows by default
|
||||
|
||||
x86:
|
||||
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
|
||||
|
||||
====================================================================
|
||||
Version 0.3.0
|
||||
23-May-2108
|
||||
|
||||
common:
|
||||
* fixed some more thread race and locking bugs
|
||||
* added preliminary support for calling an OpenMP build of the library from multiple threads
|
||||
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
|
||||
* general code cleanup
|
||||
* optimized DSDOT implementation
|
||||
* improved thread distribution for GEMM
|
||||
* corrected IMATCOPY/OMATCOPY implementation
|
||||
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
|
||||
* cmake build improvements
|
||||
* pkgconfig file now contains build options
|
||||
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
|
||||
* corrections and improvements for systems with more than 64 cpus
|
||||
* LAPACK code updated to 3.8.0 including later fixes
|
||||
* added ReLAPACK, a recursive implementation of several LAPACK functions
|
||||
* Rewrote ROTMG to handle cases that the netlib code failed to address
|
||||
* Disabled (broken) multithreading code for xTRMV
|
||||
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
|
||||
* shared memory access failures on startup are now handled more gracefully
|
||||
* restored utests from earlier releases (and made them pass on all affected systems)
|
||||
|
||||
SPARC:
|
||||
* several fixes for cpu autodetection
|
||||
|
||||
POWER:
|
||||
* corrected vector register overwriting in several Power8 kernels
|
||||
* optimized additional BLAS functions
|
||||
|
||||
ARM:
|
||||
* added support for CortexA53 and A72
|
||||
* added autodetection for ThunderX2T99
|
||||
* made most optimized kernels the default for generic ARMv8 targets
|
||||
|
||||
x86_64:
|
||||
* parallelized DDOT kernel for Haswell
|
||||
* changed alignment directives in assembly kernels to boost performance on OSX
|
||||
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
|
||||
* added support for building on OpenBSD and Dragonfly
|
||||
* updated compiler options to work with Intel release 2018
|
||||
* support fully optimized build with clang/flang on Microsoft Windows
|
||||
* fixed building on AIX
|
||||
|
||||
IBM Z:
|
||||
* added optimized BLAS 1/2 functions
|
||||
|
||||
MIPS:
|
||||
* fixed cpu autodetection helper code
|
||||
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
|
||||
* added mips64 I6500 cpu
|
||||
|
||||
====================================================================
|
||||
Version 0.2.20
|
||||
24-Jul-2017
|
||||
|
|
4
Makefile
4
Makefile
|
@ -97,7 +97,7 @@ endif
|
|||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
|
@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
|
|||
ifdef SMP
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else ifeq ($(OSNAME), Haiku)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
|
|
|
@ -66,7 +66,7 @@ endif
|
|||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.1.dev
|
||||
VERSION = 0.3.4.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
# BUILD_RELAPACK = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
# If you want to use the new, still somewhat experimental code that uses
|
||||
# thread-local storage instead of a central memory buffer in memory.c
|
||||
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
|
||||
# for this to work.
|
||||
USE_TLS = 1
|
||||
|
||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||
# compiler supports this. It's safe to keep comment it out if you
|
||||
|
|
|
@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
|||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
ifdef USE_TLS
|
||||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
ifndef SYMBOLPREFIX
|
||||
SYMBOLPREFIX =
|
||||
endif
|
||||
|
|
|
@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
|
|||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
|
@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
|
|||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
|
@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
|||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels.
|
||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
|
|
|
@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 0.0};
|
||||
char trans='N';
|
||||
blasint m, i, j;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
|
|
4
c_check
4
c_check
|
@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
|
|||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
|
@ -223,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/;
|
|||
$need_fu = $1;
|
||||
|
||||
$cross = 0;
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
if ($architecture ne $hostarch) {
|
||||
$cross = 1;
|
||||
|
@ -231,6 +231,8 @@ if ($architecture ne $hostarch) {
|
|||
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
||||
}
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
|
|
3
cblas.h
3
cblas.h
|
@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
|
|||
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
|
||||
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
|
||||
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
|
||||
|
||||
typedef CBLAS_ORDER CBLAS_LAYOUT;
|
||||
|
||||
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
|
|
|
@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
|
|||
endif ()
|
||||
|
||||
# Cannot run getarch on target if we are cross-compiling
|
||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
|
||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
|
||||
# Write to config as getarch would
|
||||
|
||||
# TODO: Set up defines that getarch sets up based on every other target
|
||||
|
|
|
@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR)
|
|||
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
|
||||
endif ()
|
||||
|
||||
if (USE_TLS)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
|
||||
endif ()
|
||||
|
||||
# Only for development
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
|
||||
|
|
|
@ -68,7 +68,7 @@ endif()
|
|||
|
||||
if (X86_64 OR X86)
|
||||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||
if (NO_AVX512 EQUAL 1)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif()
|
||||
|
|
10
common.h
10
common.h
|
@ -105,6 +105,10 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#define NO_SYSV_IPC
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
#ifdef ATOM
|
||||
#define GOTO_ATOM ATOM
|
||||
|
@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
|
|||
|
||||
#ifdef USE64BITINT
|
||||
typedef BLASLONG blasint;
|
||||
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
||||
#define blasabs(x) llabs(x)
|
||||
#else
|
||||
#define blasabs(x) labs(x)
|
||||
#endif
|
||||
#else
|
||||
typedef int blasint;
|
||||
#define blasabs(x) abs(x)
|
||||
#endif
|
||||
#else
|
||||
#ifdef USE64BITINT
|
||||
|
|
|
@ -142,6 +142,52 @@ int detect(void){
|
|||
|
||||
return CPUTYPE_PPC970;
|
||||
#endif
|
||||
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
|
||||
int id;
|
||||
id = __asm __volatile("mfpvr %0" : "=r"(id));
|
||||
switch ( id >> 16 ) {
|
||||
case 0x4e: // POWER9
|
||||
return return CPUTYPE_POWER8;
|
||||
break;
|
||||
case 0x4d:
|
||||
case 0x4b: // POWER8/8E
|
||||
return CPUTYPE_POWER8;
|
||||
break;
|
||||
case 0x4a:
|
||||
case 0x3f: // POWER7/7E
|
||||
return CPUTYPE_POWER6;
|
||||
break;
|
||||
case 0x3e:
|
||||
return CPUTYPE_POWER6;
|
||||
break;
|
||||
case 0x3a:
|
||||
return CPUTYPE_POWER5;
|
||||
break;
|
||||
case 0x35:
|
||||
case 0x38: // POWER4 /4+
|
||||
return CPUTYPE_POWER4;
|
||||
break;
|
||||
case 0x40:
|
||||
case 0x41: // POWER3 /3+
|
||||
return CPUTYPE_POWER3;
|
||||
break;
|
||||
case 0x39:
|
||||
case 0x3c:
|
||||
case 0x44:
|
||||
case 0x45:
|
||||
return CPUTYPE_PPC970;
|
||||
break;
|
||||
case 0x70:
|
||||
return CPUTYPE_CELL;
|
||||
break;
|
||||
case 0x8003:
|
||||
return CPUTYPE_PPCG4;
|
||||
break;
|
||||
default:
|
||||
return CPUTYPE_UNKNOWN;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void get_architecture(void){
|
||||
|
|
|
@ -1452,6 +1452,8 @@ int get_cpuname(void){
|
|||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
case 8:
|
||||
// AMD Ryzen2
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
|
|
|
@ -29,15 +29,18 @@
|
|||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13"
|
||||
"Z13",
|
||||
"Z14"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13"
|
||||
"z13",
|
||||
"z14"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
|
@ -62,6 +65,10 @@ int detect(void)
|
|||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
/* detect z14, but fall back to z13 */
|
||||
if (strstr(p, "3906")) return CPU_Z13;
|
||||
if (strstr(p, "3907")) return CPU_Z13;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
|
@ -107,5 +114,9 @@ void get_cpuconfig(void)
|
|||
printf("#define Z13\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
4
ctest.c
4
ctest.c
|
@ -101,6 +101,10 @@ OS_INTERIX
|
|||
OS_LINUX
|
||||
#endif
|
||||
|
||||
#if defined(__HAIKU__)
|
||||
OS_HAIKU
|
||||
#endif
|
||||
|
||||
#if defined(__i386) || defined(_X86)
|
||||
ARCH_X86
|
||||
#endif
|
||||
|
|
|
@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
|
||||
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
|
||||
|
||||
/* Make sure if no one is using workspace */
|
||||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
#if defined(FUSED_GEMM) && !defined(TIMING)
|
||||
|
||||
/* Fused operation to copy region of B into workspace and apply kernel */
|
||||
|
@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
}
|
||||
#endif
|
||||
|
||||
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
|
||||
/* Make sure if no one is using workspace */
|
||||
START_RPCC();
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
STOP_RPCC(waiting1);
|
||||
/* Set flag so other threads can access local region of B */
|
||||
/* Set flag so other threads can access local region of B */
|
||||
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
WMB;
|
||||
}
|
||||
WMB;
|
||||
}
|
||||
|
||||
/* Get regions of B from other threads and apply kernel */
|
||||
|
@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Clear synchronization flag if this thread is done with other region of B */
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
||||
/* Iterate through steps of m
|
||||
/* Iterate through steps of m
|
||||
* Note: First step has already been finished */
|
||||
for(is = m_from + min_i; is < m_to; is += min_i){
|
||||
min_i = m_to - is;
|
||||
|
@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
||||
c, ldc, is, js);
|
||||
STOP_RPCC(kernel);
|
||||
|
||||
|
||||
#ifdef TIMING
|
||||
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
|
||||
#endif
|
||||
|
||||
|
||||
/* Clear synchronization flag if this thread is done with region of B */
|
||||
if (is + min_i >= m_to) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
|
|
|
@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){
|
|||
#ifndef NO_AVX2
|
||||
return &gotoblas_HASWELL;
|
||||
#else
|
||||
return &gotblas_SANDYBRIDGE;
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return &gotoblas_NEHALEM;
|
||||
|
@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1) {
|
||||
if (model == 1 || model == 8) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
#if _MSC_VER < 1900
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static char* openblas_config_str=""
|
||||
#ifdef USE64BITINT
|
||||
"USE64BITINT "
|
||||
|
|
|
@ -122,7 +122,7 @@ endif
|
|||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
|
|
|
@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (trans) lenx = m;
|
||||
if (trans) leny = n;
|
||||
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (trans) lenx = m;
|
||||
if (trans) leny = n;
|
||||
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
long double s;
|
||||
long double r, roe, z;
|
||||
|
||||
long double ada = fabs(da);
|
||||
long double adb = fabs(db);
|
||||
long double ada = fabsl(da);
|
||||
long double adb = fabsl(db);
|
||||
long double scale = ada + adb;
|
||||
|
||||
#ifndef CBLAS
|
||||
|
|
|
@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
|
|
@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (trans & 1) lenx = m;
|
||||
if (trans & 1) leny = n;
|
||||
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||
|
||||
|
|
|
@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (trans & 1) lenx = m;
|
||||
if (trans & 1) leny = n;
|
||||
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||
|
||||
|
|
|
@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
|
|
@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
|
|
@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
long double db_i = *(DB + 1);
|
||||
long double r;
|
||||
|
||||
long double ada = fabs(da_r) + fabs(da_i);
|
||||
long double ada = fabsl(da_r) + fabsl(da_i);
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
|
|
|
@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
|
|||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
ifeq ($(ARCH), zarch)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
ifndef SNRM2KERNEL
|
||||
SNRM2KERNEL = nrm2.c
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
endif
|
||||
|
||||
ifndef DNRM2KERNEL
|
||||
DNRM2KERNEL = nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
endif
|
||||
|
||||
ifndef CNRM2KERNEL
|
||||
CNRM2KERNEL = znrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
endif
|
||||
|
||||
ifndef ZNRM2KERNEL
|
||||
ZNRM2KERNEL = znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
endif
|
||||
|
||||
ifndef SCABS_KERNEL
|
||||
|
|
|
@ -51,10 +51,12 @@ CDOTKERNEL = zdot.S
|
|||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
|
@ -86,7 +88,11 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
|||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SGEMMKERNEL = sgemm_kernel_4x4.S
|
||||
else
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
|
|
|
@ -661,7 +661,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
b[ 9] = ZERO;
|
||||
b[ 10] = ZERO;
|
||||
b[ 11] = ZERO;
|
||||
b[ 11] = ZERO;
|
||||
b[ 12] = ZERO;
|
||||
b[ 13] = ZERO;
|
||||
b[ 14] = ZERO;
|
||||
b[ 15] = ZERO;
|
||||
|
|
|
@ -1,3 +1,12 @@
|
|||
CAXPYKERNEL = ../mips/zaxpy.c
|
||||
ZAXPYKERNEL = ../mips/zaxpy.c
|
||||
SROTKERNEL = ../mips/rot.c
|
||||
DROTKERNEL = ../mips/rot.c
|
||||
CROTKERNEL = ../mips/zrot.c
|
||||
ZROTKERNEL = ../mips/zrot.c
|
||||
CSWAPKERNEL = ../mips/zswap.c
|
||||
ZSWAPKERNEL = ../mips/zswap.c
|
||||
|
||||
ifndef SNRM2KERNEL
|
||||
SNRM2KERNEL = snrm2.S
|
||||
endif
|
||||
|
|
|
@ -103,35 +103,83 @@
|
|||
.align 3
|
||||
|
||||
.L12:
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
#endif
|
||||
LD a1, 4 * SIZE(X)
|
||||
LD b1, 4 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a2, a2
|
||||
cvt.d.s b2, b2
|
||||
madd.d s2, s2, a2, b2
|
||||
#else
|
||||
MADD s2, s2, a2, b2
|
||||
#endif
|
||||
LD a2, 5 * SIZE(X)
|
||||
LD b2, 5 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a3, a3
|
||||
cvt.d.s b3, b3
|
||||
madd.d s1, s1, a3, b3
|
||||
#else
|
||||
MADD s1, s1, a3, b3
|
||||
#endif
|
||||
LD a3, 6 * SIZE(X)
|
||||
LD b3, 6 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a4, a4
|
||||
cvt.d.s b4, b4
|
||||
madd.d s2, s2, a4, b4
|
||||
#else
|
||||
MADD s2, s2, a4, b4
|
||||
#endif
|
||||
LD a4, 7 * SIZE(X)
|
||||
LD b4, 7 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
#endif
|
||||
LD a1, 8 * SIZE(X)
|
||||
LD b1, 8 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a2, a2
|
||||
cvt.d.s b2, b2
|
||||
madd.d s2, s2, a2, b2
|
||||
#else
|
||||
MADD s2, s2, a2, b2
|
||||
#endif
|
||||
LD a2, 9 * SIZE(X)
|
||||
LD b2, 9 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a3, a3
|
||||
cvt.d.s b3, b3
|
||||
madd.d s1, s1, a3, b3
|
||||
#else
|
||||
MADD s1, s1, a3, b3
|
||||
#endif
|
||||
LD a3, 10 * SIZE(X)
|
||||
LD b3, 10 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a4, a4
|
||||
cvt.d.s b4, b4
|
||||
madd.d s2, s2, a4, b4
|
||||
#else
|
||||
MADD s2, s2, a4, b4
|
||||
#endif
|
||||
LD a4, 11 * SIZE(X)
|
||||
LD b4, 11 * SIZE(Y)
|
||||
|
||||
|
@ -143,29 +191,77 @@
|
|||
.align 3
|
||||
|
||||
.L13:
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
#endif
|
||||
LD a1, 4 * SIZE(X)
|
||||
LD b1, 4 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a2, a2
|
||||
cvt.d.s b2, b2
|
||||
madd.d s2, s2, a2, b2
|
||||
#else
|
||||
MADD s2, s2, a2, b2
|
||||
#endif
|
||||
LD a2, 5 * SIZE(X)
|
||||
LD b2, 5 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a3, a3
|
||||
cvt.d.s b3, b3
|
||||
madd.d s1, s1, a3, b3
|
||||
#else
|
||||
MADD s1, s1, a3, b3
|
||||
#endif
|
||||
LD a3, 6 * SIZE(X)
|
||||
LD b3, 6 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a4, a4
|
||||
cvt.d.s b4, b4
|
||||
madd.d s2, s2, a4, b4
|
||||
#else
|
||||
MADD s2, s2, a4, b4
|
||||
#endif
|
||||
LD a4, 7 * SIZE(X)
|
||||
LD b4, 7 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
#endif
|
||||
daddiu X, X, 8 * SIZE
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a2, a2
|
||||
cvt.d.s b2, b2
|
||||
madd.d s2, s2, a2, b2
|
||||
#else
|
||||
MADD s2, s2, a2, b2
|
||||
#endif
|
||||
daddiu Y, Y, 8 * SIZE
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a3, a3
|
||||
cvt.d.s b3, b3
|
||||
madd.d s1, s1, a3, b3
|
||||
#else
|
||||
MADD s1, s1, a3, b3
|
||||
#endif
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a4, a4
|
||||
cvt.d.s b4, b4
|
||||
madd.d s2, s2, a4, b4
|
||||
#else
|
||||
MADD s2, s2, a4, b4
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
|
@ -179,8 +275,13 @@
|
|||
LD a1, 0 * SIZE(X)
|
||||
LD b1, 0 * SIZE(Y)
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
|
||||
#endif
|
||||
daddiu I, I, -1
|
||||
|
||||
daddiu X, X, SIZE
|
||||
|
@ -225,50 +326,85 @@
|
|||
LD b1, 0 * SIZE(Y)
|
||||
dadd Y, Y, INCY
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
|
||||
#endif
|
||||
LD a1, 0 * SIZE(X)
|
||||
dadd X, X, INCX
|
||||
LD b1, 0 * SIZE(Y)
|
||||
dadd Y, Y, INCY
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s2, s2, a1, b1
|
||||
#else
|
||||
MADD s2, s2, a1, b1
|
||||
|
||||
#endif
|
||||
LD a1, 0 * SIZE(X)
|
||||
dadd X, X, INCX
|
||||
LD b1, 0 * SIZE(Y)
|
||||
dadd Y, Y, INCY
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
|
||||
#endif
|
||||
LD a1, 0 * SIZE(X)
|
||||
dadd X, X, INCX
|
||||
LD b1, 0 * SIZE(Y)
|
||||
dadd Y, Y, INCY
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s2, s2, a1, b1
|
||||
#else
|
||||
MADD s2, s2, a1, b1
|
||||
|
||||
#endif
|
||||
LD a1, 0 * SIZE(X)
|
||||
dadd X, X, INCX
|
||||
LD b1, 0 * SIZE(Y)
|
||||
dadd Y, Y, INCY
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
|
||||
#endif
|
||||
LD a1, 0 * SIZE(X)
|
||||
dadd X, X, INCX
|
||||
LD b1, 0 * SIZE(Y)
|
||||
dadd Y, Y, INCY
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s2, s2, a1, b1
|
||||
#else
|
||||
MADD s2, s2, a1, b1
|
||||
|
||||
#endif
|
||||
LD a1, 0 * SIZE(X)
|
||||
dadd X, X, INCX
|
||||
LD b1, 0 * SIZE(Y)
|
||||
dadd Y, Y, INCY
|
||||
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
|
||||
#endif
|
||||
LD a1, 0 * SIZE(X)
|
||||
dadd X, X, INCX
|
||||
LD b1, 0 * SIZE(Y)
|
||||
|
@ -277,7 +413,13 @@
|
|||
daddiu I, I, -1
|
||||
|
||||
bgtz I, .L23
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s2, s2, a1, b1
|
||||
#else
|
||||
MADD s2, s2, a1, b1
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L25:
|
||||
|
@ -296,13 +438,20 @@
|
|||
daddiu I, I, -1
|
||||
|
||||
bgtz I, .L26
|
||||
#ifdef DSDOT
|
||||
cvt.d.s a1, a1
|
||||
cvt.d.s b1, b1
|
||||
madd.d s1, s1, a1, b1
|
||||
#else
|
||||
MADD s1, s1, a1, b1
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
ADD s1, s1, s2
|
||||
#ifdef DSDOT
|
||||
cvt.d.s s1, s1
|
||||
add.d s1, s1, s2
|
||||
#else
|
||||
ADD s1, s1, s2
|
||||
#endif
|
||||
j $31
|
||||
NOP
|
||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "daxpy_microk_steamroller-2.c"
|
||||
#elif defined(PILEDRIVER)
|
||||
#include "daxpy_microk_piledriver-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "daxpy_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "daxpy_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "daxpy_microk_sandy-2.c"
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
|
||||
__m256d __alpha;
|
||||
|
||||
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
BLASLONG n32;
|
||||
__m512d __alpha5;
|
||||
__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
n32 = n & ~31;
|
||||
|
||||
for (; i < n32; i+= 32) {
|
||||
_mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0]));
|
||||
_mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + __alpha5 * _mm512_loadu_pd(&x[i + 8]));
|
||||
_mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16]));
|
||||
_mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24]));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
for (; i < n; i+= 16) {
|
||||
_mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0]));
|
||||
_mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4]));
|
||||
_mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8]));
|
||||
_mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
|
||||
}
|
||||
}
|
||||
#else
|
||||
#include "daxpy_microk_haswell-2.c"
|
||||
#endif
|
||||
|
||||
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "ddot_microk_piledriver-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "ddot_microk_nehalem-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "ddot_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "ddot_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "ddot_microk_sandy-2.c"
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
{
|
||||
int i = 0;
|
||||
__m256d accum_0, accum_1, accum_2, accum_3;
|
||||
|
||||
accum_0 = _mm256_setzero_pd();
|
||||
accum_1 = _mm256_setzero_pd();
|
||||
accum_2 = _mm256_setzero_pd();
|
||||
accum_3 = _mm256_setzero_pd();
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d accum_05, accum_15, accum_25, accum_35;
|
||||
int n32;
|
||||
n32 = n & (~31);
|
||||
|
||||
accum_05 = _mm512_setzero_pd();
|
||||
accum_15 = _mm512_setzero_pd();
|
||||
accum_25 = _mm512_setzero_pd();
|
||||
accum_35 = _mm512_setzero_pd();
|
||||
|
||||
for (; i < n32; i += 32) {
|
||||
accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]);
|
||||
accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]);
|
||||
accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]);
|
||||
accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]);
|
||||
}
|
||||
|
||||
/*
|
||||
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||
* below can continue using the intermediate results in its loop
|
||||
*/
|
||||
accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1);
|
||||
accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1);
|
||||
accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1);
|
||||
accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1);
|
||||
|
||||
#endif
|
||||
for (; i < n; i += 16) {
|
||||
accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
|
||||
accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
|
||||
accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
|
||||
accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
|
||||
}
|
||||
|
||||
/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
|
||||
|
||||
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
|
||||
|
||||
__m128d half_accum0;
|
||||
|
||||
/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
|
||||
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
|
||||
|
||||
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
|
||||
|
||||
*dot = half_accum0[0];
|
||||
}
|
||||
|
||||
#else
|
||||
#include "ddot_microk_haswell-2.c"
|
||||
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NEHALEM)
|
||||
#include "dgemv_n_microk_nehalem-4.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#include "dgemv_n_microk_haswell-4.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "dgemv_n_microk_skylakex-4.c"
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
int i = 0;
|
||||
|
||||
__m256d x0, x1, x2, x3;
|
||||
__m256d __alpha;
|
||||
|
||||
x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||
x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||
x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2]));
|
||||
x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3]));
|
||||
|
||||
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
int n5;
|
||||
__m512d x05, x15, x25, x35;
|
||||
__m512d __alpha5;
|
||||
n5 = n & ~7;
|
||||
|
||||
x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||
x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||
x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2]));
|
||||
x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3]));
|
||||
|
||||
__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
for (; i < n5; i+= 8) {
|
||||
__m512d tempY;
|
||||
__m512d sum;
|
||||
|
||||
sum = _mm512_loadu_pd(&ap[0][i]) * x05 +
|
||||
_mm512_loadu_pd(&ap[1][i]) * x15 +
|
||||
_mm512_loadu_pd(&ap[2][i]) * x25 +
|
||||
_mm512_loadu_pd(&ap[3][i]) * x35;
|
||||
|
||||
tempY = _mm512_loadu_pd(&y[i]);
|
||||
tempY += sum * __alpha5;
|
||||
_mm512_storeu_pd(&y[i], tempY);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < n; i+= 4) {
|
||||
__m256d tempY;
|
||||
__m256d sum;
|
||||
|
||||
sum = _mm256_loadu_pd(&ap[0][i]) * x0 +
|
||||
_mm256_loadu_pd(&ap[1][i]) * x1 +
|
||||
_mm256_loadu_pd(&ap[2][i]) * x2 +
|
||||
_mm256_loadu_pd(&ap[3][i]) * x3;
|
||||
|
||||
tempY = _mm256_loadu_pd(&y[i]);
|
||||
tempY += sum * __alpha;
|
||||
_mm256_storeu_pd(&y[i], tempY);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x2
|
||||
|
||||
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
int i = 0;
|
||||
|
||||
__m256d x0, x1;
|
||||
__m256d __alpha;
|
||||
|
||||
x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
|
||||
x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
|
||||
|
||||
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
|
||||
|
||||
for (i = 0; i < n; i+= 4) {
|
||||
__m256d tempY;
|
||||
__m256d sum;
|
||||
|
||||
sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1;
|
||||
|
||||
tempY = _mm256_loadu_pd(&y[i]);
|
||||
tempY += sum * __alpha;
|
||||
_mm256_storeu_pd(&y[i], tempY);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#include "dgemv_n_microk_haswell-4.c"
|
||||
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "dscal_microk_bulldozer-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "dscal_microk_sandy-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "dscal_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "dscal_microk_skylakex-2.c"
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014-2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
for (; i < n; i += 8) {
|
||||
_mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0]));
|
||||
}
|
||||
#else
|
||||
__m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
|
||||
for (; i < n; i += 8) {
|
||||
_mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0]));
|
||||
_mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
/* question to self: Why is this not just memset() */
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d zero = _mm512_setzero_pd();
|
||||
for (; i < n; i += 8) {
|
||||
_mm512_storeu_pd(&x[i], zero);
|
||||
}
|
||||
#else
|
||||
__m256d zero = _mm256_setzero_pd();
|
||||
for (; i < n; i += 8) {
|
||||
_mm256_storeu_pd(&x[i + 0], zero);
|
||||
_mm256_storeu_pd(&x[i + 4], zero);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#include "dscal_microk_haswell-2.c"
|
||||
#endif
|
|
@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#include "dsymv_L_microk_bulldozer-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "dsymv_L_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "dsymv_L_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "dsymv_L_microk_sandy-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
|
|
|
@ -0,0 +1,161 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
|
||||
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
|
||||
{
|
||||
|
||||
|
||||
__m256d accum_0, accum_1, accum_2, accum_3;
|
||||
__m256d temp1_0, temp1_1, temp1_2, temp1_3;
|
||||
|
||||
/* the 256 bit wide acculmulator vectors start out as zero */
|
||||
accum_0 = _mm256_setzero_pd();
|
||||
accum_1 = _mm256_setzero_pd();
|
||||
accum_2 = _mm256_setzero_pd();
|
||||
accum_3 = _mm256_setzero_pd();
|
||||
|
||||
temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0]));
|
||||
temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1]));
|
||||
temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2]));
|
||||
temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3]));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d accum_05, accum_15, accum_25, accum_35;
|
||||
__m512d temp1_05, temp1_15, temp1_25, temp1_35;
|
||||
BLASLONG to2;
|
||||
int delta;
|
||||
|
||||
/* the 512 bit wide accumulator vectors start out as zero */
|
||||
accum_05 = _mm512_setzero_pd();
|
||||
accum_15 = _mm512_setzero_pd();
|
||||
accum_25 = _mm512_setzero_pd();
|
||||
accum_35 = _mm512_setzero_pd();
|
||||
|
||||
temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0]));
|
||||
temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1]));
|
||||
temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2]));
|
||||
temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3]));
|
||||
|
||||
delta = (to - from) & ~7;
|
||||
to2 = from + delta;
|
||||
|
||||
|
||||
for (; from < to2; from += 8) {
|
||||
__m512d _x, _y;
|
||||
__m512d a0, a1, a2, a3;
|
||||
|
||||
_y = _mm512_loadu_pd(&y[from]);
|
||||
_x = _mm512_loadu_pd(&x[from]);
|
||||
|
||||
a0 = _mm512_loadu_pd(&a[0][from]);
|
||||
a1 = _mm512_loadu_pd(&a[1][from]);
|
||||
a2 = _mm512_loadu_pd(&a[2][from]);
|
||||
a3 = _mm512_loadu_pd(&a[3][from]);
|
||||
|
||||
_y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3;
|
||||
|
||||
accum_05 += _x * a0;
|
||||
accum_15 += _x * a1;
|
||||
accum_25 += _x * a2;
|
||||
accum_35 += _x * a3;
|
||||
|
||||
_mm512_storeu_pd(&y[from], _y);
|
||||
|
||||
};
|
||||
|
||||
/*
|
||||
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||
* below can continue using the intermediate results in its loop
|
||||
*/
|
||||
accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1));
|
||||
accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1));
|
||||
accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1));
|
||||
accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1));
|
||||
|
||||
#endif
|
||||
|
||||
for (; from != to; from += 4) {
|
||||
__m256d _x, _y;
|
||||
__m256d a0, a1, a2, a3;
|
||||
|
||||
_y = _mm256_loadu_pd(&y[from]);
|
||||
_x = _mm256_loadu_pd(&x[from]);
|
||||
|
||||
/* load 4 rows of matrix data */
|
||||
a0 = _mm256_loadu_pd(&a[0][from]);
|
||||
a1 = _mm256_loadu_pd(&a[1][from]);
|
||||
a2 = _mm256_loadu_pd(&a[2][from]);
|
||||
a3 = _mm256_loadu_pd(&a[3][from]);
|
||||
|
||||
_y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3;
|
||||
|
||||
accum_0 += _x * a0;
|
||||
accum_1 += _x * a1;
|
||||
accum_2 += _x * a2;
|
||||
accum_3 += _x * a3;
|
||||
|
||||
_mm256_storeu_pd(&y[from], _y);
|
||||
|
||||
};
|
||||
|
||||
/*
|
||||
* we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2
|
||||
* output array. There is no direct instruction for this in 256 bit space, only in 128 space.
|
||||
*/
|
||||
|
||||
__m128d half_accum0, half_accum1, half_accum2, half_accum3;
|
||||
|
||||
|
||||
/* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */
|
||||
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
|
||||
half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1));
|
||||
half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1));
|
||||
half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1));
|
||||
|
||||
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
|
||||
half_accum1 = _mm_hadd_pd(half_accum1, half_accum1);
|
||||
half_accum2 = _mm_hadd_pd(half_accum2, half_accum2);
|
||||
half_accum3 = _mm_hadd_pd(half_accum3, half_accum3);
|
||||
|
||||
/* and store the lowest double value from each of these vectors in the temp2 output */
|
||||
temp2[0] += half_accum0[0];
|
||||
temp2[1] += half_accum1[0];
|
||||
temp2[2] += half_accum2[0];
|
||||
temp2[3] += half_accum3[0];
|
||||
}
|
||||
#else
|
||||
#include "dsymv_L_microk_haswell-2.c"
|
||||
#endif
|
|
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NEHALEM)
|
||||
#include "saxpy_microk_nehalem-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "saxpy_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "saxpy_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "saxpy_microk_sandy-2.c"
|
||||
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
|
||||
__m256 __alpha;
|
||||
|
||||
__alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha));
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
BLASLONG n64;
|
||||
__m512 __alpha5;
|
||||
__alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha));
|
||||
|
||||
n64 = n & ~63;
|
||||
|
||||
for (; i < n64; i+= 64) {
|
||||
_mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0]));
|
||||
_mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16]));
|
||||
_mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32]));
|
||||
_mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48]));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
for (; i < n; i+= 32) {
|
||||
_mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0]));
|
||||
_mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8]));
|
||||
_mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16]));
|
||||
_mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24]));
|
||||
}
|
||||
}
|
||||
#else
|
||||
#include "saxpy_microk_haswell-2.c"
|
||||
#endif
|
||||
|
|
@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "sdot_microk_steamroller-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "sdot_microk_nehalem-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "sdot_microk_haswell-2.c"
|
||||
#elif defined (SKYLAKEX)
|
||||
#include "sdot_microk_skylakex-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "sdot_microk_sandy-2.c"
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,98 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
|
||||
{
|
||||
int i = 0;
|
||||
__m256 accum_0, accum_1, accum_2, accum_3;
|
||||
|
||||
accum_0 = _mm256_setzero_ps();
|
||||
accum_1 = _mm256_setzero_ps();
|
||||
accum_2 = _mm256_setzero_ps();
|
||||
accum_3 = _mm256_setzero_ps();
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512 accum_05, accum_15, accum_25, accum_35;
|
||||
int n64;
|
||||
n64 = n & (~63);
|
||||
|
||||
accum_05 = _mm512_setzero_ps();
|
||||
accum_15 = _mm512_setzero_ps();
|
||||
accum_25 = _mm512_setzero_ps();
|
||||
accum_35 = _mm512_setzero_ps();
|
||||
|
||||
for (; i < n64; i += 64) {
|
||||
accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]);
|
||||
accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]);
|
||||
accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]);
|
||||
accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]);
|
||||
}
|
||||
|
||||
/*
|
||||
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
|
||||
* below can continue using the intermediate results in its loop
|
||||
*/
|
||||
accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1);
|
||||
accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1);
|
||||
accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1);
|
||||
accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1);
|
||||
|
||||
#endif
|
||||
for (; i < n; i += 32) {
|
||||
accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]);
|
||||
accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]);
|
||||
accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]);
|
||||
accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]);
|
||||
}
|
||||
|
||||
/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
|
||||
|
||||
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
|
||||
|
||||
__m128 half_accum0;
|
||||
|
||||
/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
|
||||
half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1);
|
||||
|
||||
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
|
||||
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
|
||||
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
|
||||
|
||||
*dot = half_accum0[0];
|
||||
}
|
||||
|
||||
#else
|
||||
#include "sdot_microk_haswell-2.c"
|
||||
#endif
|
|
@ -50,7 +50,6 @@ lapack_int LAPACKE_dsytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i
|
|||
}
|
||||
} else if( matrix_layout == LAPACK_ROW_MAJOR ) {
|
||||
lapack_int lda_t = MAX(1,n);
|
||||
lapack_int ldb_t = MAX(1,n);
|
||||
double* a_t = NULL;
|
||||
double* tb_t = NULL;
|
||||
/* Check leading dimension(s) */
|
||||
|
|
|
@ -50,7 +50,6 @@ lapack_int LAPACKE_zhetrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i
|
|||
}
|
||||
} else if( matrix_layout == LAPACK_ROW_MAJOR ) {
|
||||
lapack_int lda_t = MAX(1,n);
|
||||
lapack_int ldb_t = MAX(1,n);
|
||||
lapack_complex_double* a_t = NULL;
|
||||
lapack_complex_double* tb_t = NULL;
|
||||
/* Check leading dimension(s) */
|
||||
|
|
|
@ -50,7 +50,6 @@ lapack_int LAPACKE_zsytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i
|
|||
}
|
||||
} else if( matrix_layout == LAPACK_ROW_MAJOR ) {
|
||||
lapack_int lda_t = MAX(1,n);
|
||||
lapack_int ldb_t = MAX(1,n);
|
||||
lapack_complex_double* a_t = NULL;
|
||||
lapack_complex_double* tb_t = NULL;
|
||||
/* Check leading dimension(s) */
|
||||
|
|
|
@ -280,8 +280,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -297,9 +297,9 @@
|
|||
*
|
||||
* Determine the block size, the workspace size and the hous size.
|
||||
*
|
||||
IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
*
|
||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -285,8 +285,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -296,7 +296,7 @@
|
|||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -277,8 +277,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -294,9 +294,9 @@
|
|||
*
|
||||
* Determine the block size, the workspace size and the hous size.
|
||||
*
|
||||
IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
*
|
||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -285,8 +285,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -296,7 +296,7 @@
|
|||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -277,8 +277,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -294,9 +294,9 @@
|
|||
*
|
||||
* Determine the block size, the workspace size and the hous size.
|
||||
*
|
||||
IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
|
||||
*
|
||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -285,8 +285,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -296,7 +296,7 @@
|
|||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
|
||||
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -280,8 +280,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -297,9 +297,9 @@
|
|||
*
|
||||
* Determine the block size, the workspace size and the hous size.
|
||||
*
|
||||
IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
|
||||
LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
|
||||
*
|
||||
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
|
@ -285,8 +285,8 @@
|
|||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
INTEGER ILAENV
|
||||
EXTERNAL LSAME, ILAENV
|
||||
INTEGER ILAENV2STAGE
|
||||
EXTERNAL LSAME, ILAENV2STAGE
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
|
@ -296,7 +296,7 @@
|
|||
INFO = 0
|
||||
UPPER = LSAME( UPLO, 'U' )
|
||||
LQUERY = ( LWORK.EQ.-1 )
|
||||
LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||
LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
|
||||
|
||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
||||
INFO = -1
|
||||
|
|
5
param.h
5
param.h
|
@ -2590,8 +2590,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#if defined(OS_DARWIN) && defined(CROSS)
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL N 2
|
||||
#else
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#endif
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||
#define DGEMM_DEFAULT_UNROLL_N 2
|
||||
|
|
|
@ -84,7 +84,7 @@ struct ctest {
|
|||
#endif
|
||||
|
||||
#if _MSC_VER < 1900
|
||||
#define snprintf _snprintf_s
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
|
||||
#ifndef __cplusplus
|
||||
|
|
Loading…
Reference in New Issue