diff --git a/CMakeLists.txt b/CMakeLists.txt index a2421ac54..20ce02e87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 1.dev) +set(OpenBLAS_PATCH_VERSION 3.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -150,6 +150,7 @@ endif() # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) +target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) # Android needs to explicitly link against libm if(ANDROID) @@ -169,6 +170,7 @@ endif() # Set output for libopenblas set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") +set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) diff --git a/Changelog.txt b/Changelog.txt index cb6fee70a..33dcacc51 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,115 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.2 +30-Jul-2018 + +common: + * fixes for regressions caused by the rewrite of the thread + initialization code in 0.3.1 + +POWER: + * fixed cpu autodetection for the BSDs + +MIPS64: + * fixed utest errors in AXPY, DSDOT, ROT and SWAP + +x86_64: + * added autodetection of AMD Ryzen 2 + * fixed build with older versions of MSVC + +==================================================================== +Version 0.3.1 +01-Jul-2018 + +common: + * rewritten thread initialization code with significantly reduced overhead + * added CBLAS interfaces to the IxAMIN BLAS extension functions + * fixed the lapack-test target + * CMAKE builds now create an OpenBLASConfig.cmake file + * ZAXPY now uses a single thread for small input sizes + * the LAPACK code was updated from Reference-LAPACK/lapack#253 + (fixing LAPACKE interfaces to Aasen's functions) + +POWER: + * corrected CROT and ZROT behaviour with zero INC_X + +ARMV7: + * corrected xDOT behaviour with zero INC_X or INC_Y + +x86_64: + * retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER, + this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO + (which will still be supported via the slower PRESCOTT kernels when this option is not set) + * added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to + specify the list of x86_64 targets to include. Any target not on the list will be supported + by the Sandybridge or Nehalem kernels if available, or by Prescott. + * improved SWITCH_RATIO on Haswell for increased GEMM throughput + * added initial support for Intel Skylake X, including an AVX512 SGEMM kernel + * added autodetection of Intel Cannon Lake series as Skylake X + * added a default L2 cache size for hypervisors that return zero here (Chromebook) + * fixed a name clash with recent Windows10 headers that broke the build with (at least) + recent mingw from MSYS2 + * fixed a link error in mixed clang/gfortran builds with OpenMP + * updated the OSX deployment target to 10.8 + * switched on parallel make for builds on MS Windows by default + +x86: + * fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y + +==================================================================== +Version 0.3.0 +23-May-2108 + +common: + * fixed some more thread race and locking bugs + * added preliminary support for calling an OpenMP build of the library from multiple threads + * removed performance impact of thread locks added in 0.2.20 on OpenMP code + * general code cleanup + * optimized DSDOT implementation + * improved thread distribution for GEMM + * corrected IMATCOPY/OMATCOPY implementation + * fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations + * cmake build improvements + * pkgconfig file now contains build options + * openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build + * corrections and improvements for systems with more than 64 cpus + * LAPACK code updated to 3.8.0 including later fixes + * added ReLAPACK, a recursive implementation of several LAPACK functions + * Rewrote ROTMG to handle cases that the netlib code failed to address + * Disabled (broken) multithreading code for xTRMV + * corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard + * shared memory access failures on startup are now handled more gracefully + * restored utests from earlier releases (and made them pass on all affected systems) + +SPARC: + * several fixes for cpu autodetection + +POWER: + * corrected vector register overwriting in several Power8 kernels + * optimized additional BLAS functions + +ARM: + * added support for CortexA53 and A72 + * added autodetection for ThunderX2T99 + * made most optimized kernels the default for generic ARMv8 targets + +x86_64: + * parallelized DDOT kernel for Haswell + * changed alignment directives in assembly kernels to boost performance on OSX + * fixed register handling in the GEMV microkernels (bug exposed by gcc7) + * added support for building on OpenBSD and Dragonfly + * updated compiler options to work with Intel release 2018 + * support fully optimized build with clang/flang on Microsoft Windows + * fixed building on AIX + +IBM Z: + * added optimized BLAS 1/2 functions + +MIPS: + * fixed cpu autodetection helper code + * added mips32 1004K cpu (Mediatek MT7621 and similar SoC) + * added mips64 I6500 cpu + ==================================================================== Version 0.2.20 24-Jul-2017 diff --git a/Makefile b/Makefile index b947c1198..d99521b19 100644 --- a/Makefile +++ b/Makefile @@ -97,7 +97,7 @@ endif shared : ifndef NO_SHARED -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) @@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN) ifdef SMP ifeq ($(OSNAME), WINNT) -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +else ifeq ($(OSNAME), Haiku) + -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc else -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc endif diff --git a/Makefile.install b/Makefile.install index c51c8a021..fa657beba 100644 --- a/Makefile.install +++ b/Makefile.install @@ -66,7 +66,7 @@ endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 677c05d93..f831b5040 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX) ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +endif endif endif diff --git a/README.md b/README.md index 02d087334..9ed9be337 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. +- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. @@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. +* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels. * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build the library with `BIGNUMA=1`. diff --git a/benchmark/gemv.c b/benchmark/gemv.c index c06e829d9..b6a42f42f 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -122,7 +122,7 @@ int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 0.0}; char trans='N'; blasint m, i, j; blasint inc_x=1,inc_y=1; diff --git a/c_check b/c_check index 3831d7aa3..8f6296d6c 100644 --- a/c_check +++ b/c_check @@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); +$os = Haiku if ($data =~ /OS_HAIKU/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index b783ef90d..f29bc3a75 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -85,7 +85,7 @@ if (NOT NOFORTRAN) endif () # Cannot run getarch on target if we are cross-compiling -if (DEFINED CORE AND CMAKE_CROSSCOMPILING) +if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) # Write to config as getarch would # TODO: Set up defines that getarch sets up based on every other target diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index a565fc0d5..d339a755f 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -68,7 +68,7 @@ endif() if (X86_64 OR X86) file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") -execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) +execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() diff --git a/common.h b/common.h index 663f37e7b..6c3d5b15e 100644 --- a/common.h +++ b/common.h @@ -105,6 +105,10 @@ extern "C" { #endif #endif +#ifdef OS_HAIKU +#define NO_SYSV_IPC +#endif + #ifdef OS_WINDOWS #ifdef ATOM #define GOTO_ATOM ATOM @@ -253,8 +257,14 @@ typedef unsigned long BLASULONG; #ifdef USE64BITINT typedef BLASLONG blasint; +#if defined(OS_WINDOWS) && defined(__64BIT__) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif #else typedef int blasint; +#define blasabs(x) abs(x) #endif #else #ifdef USE64BITINT diff --git a/cpuid_power.c b/cpuid_power.c index 951204ae9..6c7baef4a 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -142,6 +142,52 @@ int detect(void){ return CPUTYPE_PPC970; #endif + +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) +int id; +id = __asm __volatile("mfpvr %0" : "=r"(id)); +switch ( id >> 16 ) { + case 0x4e: // POWER9 + return return CPUTYPE_POWER8; + break; + case 0x4d: + case 0x4b: // POWER8/8E + return CPUTYPE_POWER8; + break; + case 0x4a: + case 0x3f: // POWER7/7E + return CPUTYPE_POWER6; + break; + case 0x3e: + return CPUTYPE_POWER6; + break; + case 0x3a: + return CPUTYPE_POWER5; + break; + case 0x35: + case 0x38: // POWER4 /4+ + return CPUTYPE_POWER4; + break; + case 0x40: + case 0x41: // POWER3 /3+ + return CPUTYPE_POWER3; + break; + case 0x39: + case 0x3c: + case 0x44: + case 0x45: + return CPUTYPE_PPC970; + break; + case 0x70: + return CPUTYPE_CELL; + break; + case 0x8003: + return CPUTYPE_PPCG4; + break; + default: + return CPUTYPE_UNKNOWN; + } +#endif } void get_architecture(void){ diff --git a/cpuid_x86.c b/cpuid_x86.c index 89eb809b0..512ad877b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1452,6 +1452,8 @@ int get_cpuname(void){ switch (model) { case 1: // AMD Ryzen + case 8: + // AMD Ryzen2 if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e1935429..e0d9221f3 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,15 +29,18 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) @@ -62,6 +65,10 @@ int detect(void) if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13; + /* detect z14, but fall back to z13 */ + if (strstr(p, "3906")) return CPU_Z13; + if (strstr(p, "3907")) return CPU_Z13; + return CPU_GENERIC; } @@ -107,5 +114,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } diff --git a/ctest.c b/ctest.c index 00be423d1..0571e9e02 100644 --- a/ctest.c +++ b/ctest.c @@ -101,6 +101,10 @@ OS_INTERIX OS_LINUX #endif +#if defined(__HAIKU__) +OS_HAIKU +#endif + #if defined(__i386) || defined(_X86) ARCH_X86 #endif diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ee3e3b9a9..aeb5e6ed4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { + /* Make sure if no one is using workspace */ + START_RPCC(); + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + STOP_RPCC(waiting1); + #if defined(FUSED_GEMM) && !defined(TIMING) /* Fused operation to copy region of B into workspace and apply kernel */ @@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { - /* Make sure if no one is using workspace */ - START_RPCC(); - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; - STOP_RPCC(waiting1); - /* Set flag so other threads can access local region of B */ + /* Set flag so other threads can access local region of B */ + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; - } + WMB; } /* Get regions of B from other threads and apply kernel */ @@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } } while (current != mypos); - /* Iterate through steps of m + /* Iterate through steps of m * Note: First step has already been finished */ for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; @@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); - + #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif - + /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 794dfb20e..1d7f570d8 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU) #include #include #include diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 13794207c..1f67dc521 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){ #ifndef NO_AVX2 return &gotoblas_HASWELL; #else - return &gotblas_SANDYBRIDGE; + return &gotoblas_SANDYBRIDGE; #endif else return &gotoblas_NEHALEM; @@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){ } } } else if (exfamily == 8) { - if (model == 1) { + if (model == 1 || model == 8) { if(support_avx()) return &gotoblas_ZEN; else{ diff --git a/driver/others/memory.c b/driver/others/memory.c index 6bca1e11f..1d408fcda 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -468,6 +468,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); # if defined(OS_WINDOWS) static DWORD local_storage_key = 0; DWORD lsk; + # else static pthread_key_t local_storage_key = 0; pthread_key_t lsk; @@ -1269,6 +1270,7 @@ void blas_shutdown(void){ #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif + #ifdef SMP /* Only cleanupIf we were built for threading and TLS was initialized */ if (local_storage_key) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 87a27712f..3e87f2cc2 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if defined(_WIN32) && defined(_MSC_VER) +#if _MSC_VER < 1900 +#define snprintf _snprintf +#endif +#endif + static char* openblas_config_str="" #ifdef USE64BITINT "USE64BITINT " diff --git a/exports/Makefile b/exports/Makefile index 127b05057..29075a9c2 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -122,7 +122,7 @@ endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) so : ../$(LIBSONAME) diff --git a/interface/gbmv.c b/interface/gbmv.c index 096c9f6f2..1d58ba807 100644 --- a/interface/gbmv.c +++ b/interface/gbmv.c @@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans) lenx = m; if (trans) leny = n; - if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/gemv.c b/interface/gemv.c index 30709e361..c9d52cd69 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans) lenx = m; if (trans) leny = n; - if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/rotg.c b/interface/rotg.c index 092554299..69443a5a0 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ long double s; long double r, roe, z; - long double ada = fabs(da); - long double adb = fabs(db); + long double ada = fabsl(da); + long double adb = fabsl(db); long double scale = ada + adb; #ifndef CBLAS diff --git a/interface/sbmv.c b/interface/sbmv.c index 761a9a0d0..25e99ca34 100644 --- a/interface/sbmv.c +++ b/interface/sbmv.c @@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/spmv.c b/interface/spmv.c index 403458b06..e08ae3f6e 100644 --- a/interface/spmv.c +++ b/interface/spmv.c @@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/symv.c b/interface/symv.c index e4e300e20..07bd20022 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/zgbmv.c b/interface/zgbmv.c index a04be2fbf..5e275a8ed 100644 --- a/interface/zgbmv.c +++ b/interface/zgbmv.c @@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans & 1) lenx = m; if (trans & 1) leny = n; - if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; diff --git a/interface/zgemv.c b/interface/zgemv.c index 0c75564f0..3e98dba7f 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans & 1) lenx = m; if (trans & 1) leny = n; - if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; diff --git a/interface/zhbmv.c b/interface/zhbmv.c index 9ad1b53a1..656f137c6 100644 --- a/interface/zhbmv.c +++ b/interface/zhbmv.c @@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/zhemv.c b/interface/zhemv.c index 2aee880dc..d1996ad69 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/zhpmv.c b/interface/zhpmv.c index b72a6d670..ff49716b5 100644 --- a/interface/zhpmv.c +++ b/interface/zhpmv.c @@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/zrotg.c b/interface/zrotg.c index 187343d41..8caa411fc 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ long double db_i = *(DB + 1); long double r; - long double ada = fabs(da_r) + fabs(da_i); + long double ada = fabsl(da_r) + fabsl(da_i); PRINT_DEBUG_NAME; diff --git a/interface/zsbmv.c b/interface/zsbmv.c index b71d4c519..cd5cefed9 100644 --- a/interface/zsbmv.c +++ b/interface/zsbmv.c @@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 6afb2cf13..e257dcfc9 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -1,3 +1,12 @@ +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S index cb6fbe99c..a645495f4 100644 --- a/kernel/mips64/dot.S +++ b/kernel/mips64/dot.S @@ -103,35 +103,83 @@ .align 3 .L12: +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 4 * SIZE(X) LD b1, 4 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 5 * SIZE(X) LD b2, 5 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif LD a3, 6 * SIZE(X) LD b3, 6 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 7 * SIZE(X) LD b4, 7 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 8 * SIZE(X) LD b1, 8 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 9 * SIZE(X) LD b2, 9 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif LD a3, 10 * SIZE(X) LD b3, 10 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 11 * SIZE(X) LD b4, 11 * SIZE(Y) @@ -143,29 +191,77 @@ .align 3 .L13: +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 4 * SIZE(X) LD b1, 4 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 5 * SIZE(X) LD b2, 5 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif LD a3, 6 * SIZE(X) LD b3, 6 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 7 * SIZE(X) LD b4, 7 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif daddiu X, X, 8 * SIZE +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif daddiu Y, Y, 8 * SIZE +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif .align 3 .L15: @@ -179,8 +275,13 @@ LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif daddiu I, I, -1 daddiu X, X, SIZE @@ -225,50 +326,85 @@ LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) @@ -277,7 +413,13 @@ daddiu I, I, -1 bgtz I, .L23 +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 +#endif .align 3 .L25: @@ -296,13 +438,20 @@ daddiu I, I, -1 bgtz I, .L26 +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif .align 3 .L999: - ADD s1, s1, s2 #ifdef DSDOT - cvt.d.s s1, s1 + add.d s1, s1, s2 +#else + ADD s1, s1, s2 #endif j $31 NOP diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index b4acdccd2..cde5bdaa6 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c new file mode 100644 index 000000000..e785a39f1 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_skylakex-2.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i = 0; + + __m256d __alpha; + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + +#ifdef __AVX512CD__ + BLASLONG n32; + __m512d __alpha5; + __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + + n32 = n & ~31; + + for (; i < n32; i+= 32) { + _mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0])); + _mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + __alpha5 * _mm512_loadu_pd(&x[i + 8])); + _mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16])); + _mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24])); + } + +#endif + + for (; i < n; i+= 16) { + _mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4])); + _mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8])); + _mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12])); + } +} +#else +#include "daxpy_microk_haswell-2.c" +#endif + + diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 0dc9cd3da..969357614 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c new file mode 100644 index 000000000..8eabf225a --- /dev/null +++ b/kernel/x86_64/ddot_microk_skylakex-2.c @@ -0,0 +1,96 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_8 1 + +#include + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + int i = 0; + __m256d accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + +#ifdef __AVX512CD__ + __m512d accum_05, accum_15, accum_25, accum_35; + int n32; + n32 = n & (~31); + + accum_05 = _mm512_setzero_pd(); + accum_15 = _mm512_setzero_pd(); + accum_25 = _mm512_setzero_pd(); + accum_35 = _mm512_setzero_pd(); + + for (; i < n32; i += 32) { + accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]); + accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]); + accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]); + accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]); + } + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1); + accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1); + accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1); + accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1); + +#endif + for (; i < n; i += 16) { + accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]); + accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]); + accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]); + accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]); + } + + /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128d half_accum0; + + /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + + *dot = half_accum0[0]; +} + +#else +#include "ddot_microk_haswell-2.c" +#endif diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 309fbe767..6d2530e81 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" +#elif defined (SKYLAKEX) +#include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c new file mode 100644 index 000000000..4030399ab --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_4x4 1 + +#include + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + int i = 0; + + __m256d x0, x1, x2, x3; + __m256d __alpha; + + x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); + x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); + x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2])); + x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3])); + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + +#ifdef __AVX512CD__ + int n5; + __m512d x05, x15, x25, x35; + __m512d __alpha5; + n5 = n & ~7; + + x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0])); + x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1])); + x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2])); + x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3])); + + __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + + for (; i < n5; i+= 8) { + __m512d tempY; + __m512d sum; + + sum = _mm512_loadu_pd(&ap[0][i]) * x05 + + _mm512_loadu_pd(&ap[1][i]) * x15 + + _mm512_loadu_pd(&ap[2][i]) * x25 + + _mm512_loadu_pd(&ap[3][i]) * x35; + + tempY = _mm512_loadu_pd(&y[i]); + tempY += sum * __alpha5; + _mm512_storeu_pd(&y[i], tempY); + } +#endif + + for (; i < n; i+= 4) { + __m256d tempY; + __m256d sum; + + sum = _mm256_loadu_pd(&ap[0][i]) * x0 + + _mm256_loadu_pd(&ap[1][i]) * x1 + + _mm256_loadu_pd(&ap[2][i]) * x2 + + _mm256_loadu_pd(&ap[3][i]) * x3; + + tempY = _mm256_loadu_pd(&y[i]); + tempY += sum * __alpha; + _mm256_storeu_pd(&y[i], tempY); + } + +} + + +#define HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + int i = 0; + + __m256d x0, x1; + __m256d __alpha; + + x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); + x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); + + __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + + + for (i = 0; i < n; i+= 4) { + __m256d tempY; + __m256d sum; + + sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1; + + tempY = _mm256_loadu_pd(&y[i]); + tempY += sum * __alpha; + _mm256_storeu_pd(&y[i], tempY); + } + +} + +#else +#include "dgemv_n_microk_haswell-4.c" +#endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 2c7b3b17c..ef9a0a6ba 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c new file mode 100644 index 000000000..e0598272e --- /dev/null +++ b/kernel/x86_64/dscal_microk_skylakex-2.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + +#ifdef __AVX512CD__ + __m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); + for (; i < n; i += 8) { + _mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0])); + } +#else + __m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); + for (; i < n; i += 8) { + _mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4])); + } +#endif +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512d zero = _mm512_setzero_pd(); + for (; i < n; i += 8) { + _mm512_storeu_pd(&x[i], zero); + } +#else + __m256d zero = _mm256_setzero_pd(); + for (; i < n; i += 8) { + _mm256_storeu_pd(&x[i + 0], zero); + _mm256_storeu_pd(&x[i + 4], zero); + } +#endif + +} + +#else +#include "dscal_microk_haswell-2.c" +#endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 73099462c..a722cc9df 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" #elif defined(NEHALEM) diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c new file mode 100644 index 000000000..8244dffa1 --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -0,0 +1,161 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_4x4 1 + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + + __m256d accum_0, accum_1, accum_2, accum_3; + __m256d temp1_0, temp1_1, temp1_2, temp1_3; + + /* the 256 bit wide acculmulator vectors start out as zero */ + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + + temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0])); + temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1])); + temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2])); + temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3])); + +#ifdef __AVX512CD__ + __m512d accum_05, accum_15, accum_25, accum_35; + __m512d temp1_05, temp1_15, temp1_25, temp1_35; + BLASLONG to2; + int delta; + + /* the 512 bit wide accumulator vectors start out as zero */ + accum_05 = _mm512_setzero_pd(); + accum_15 = _mm512_setzero_pd(); + accum_25 = _mm512_setzero_pd(); + accum_35 = _mm512_setzero_pd(); + + temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0])); + temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1])); + temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2])); + temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3])); + + delta = (to - from) & ~7; + to2 = from + delta; + + + for (; from < to2; from += 8) { + __m512d _x, _y; + __m512d a0, a1, a2, a3; + + _y = _mm512_loadu_pd(&y[from]); + _x = _mm512_loadu_pd(&x[from]); + + a0 = _mm512_loadu_pd(&a[0][from]); + a1 = _mm512_loadu_pd(&a[1][from]); + a2 = _mm512_loadu_pd(&a[2][from]); + a3 = _mm512_loadu_pd(&a[3][from]); + + _y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3; + + accum_05 += _x * a0; + accum_15 += _x * a1; + accum_25 += _x * a2; + accum_35 += _x * a3; + + _mm512_storeu_pd(&y[from], _y); + + }; + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1)); + accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1)); + accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1)); + accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1)); + +#endif + + for (; from != to; from += 4) { + __m256d _x, _y; + __m256d a0, a1, a2, a3; + + _y = _mm256_loadu_pd(&y[from]); + _x = _mm256_loadu_pd(&x[from]); + + /* load 4 rows of matrix data */ + a0 = _mm256_loadu_pd(&a[0][from]); + a1 = _mm256_loadu_pd(&a[1][from]); + a2 = _mm256_loadu_pd(&a[2][from]); + a3 = _mm256_loadu_pd(&a[3][from]); + + _y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3; + + accum_0 += _x * a0; + accum_1 += _x * a1; + accum_2 += _x * a2; + accum_3 += _x * a3; + + _mm256_storeu_pd(&y[from], _y); + + }; + + /* + * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2 + * output array. There is no direct instruction for this in 256 bit space, only in 128 space. + */ + + __m128d half_accum0, half_accum1, half_accum2, half_accum3; + + + /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */ + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1)); + half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1)); + half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1)); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + half_accum1 = _mm_hadd_pd(half_accum1, half_accum1); + half_accum2 = _mm_hadd_pd(half_accum2, half_accum2); + half_accum3 = _mm_hadd_pd(half_accum3, half_accum3); + + /* and store the lowest double value from each of these vectors in the temp2 output */ + temp2[0] += half_accum0[0]; + temp2[1] += half_accum1[0]; + temp2[2] += half_accum2[0]; + temp2[3] += half_accum3[0]; +} +#else +#include "dsymv_L_microk_haswell-2.c" +#endif \ No newline at end of file diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index d89c4070d..e1349da58 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c new file mode 100644 index 000000000..950f10ba2 --- /dev/null +++ b/kernel/x86_64/saxpy_microk_skylakex-2.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_16 1 + +#include + +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i = 0; + + __m256 __alpha; + + __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); + +#ifdef __AVX512CD__ + BLASLONG n64; + __m512 __alpha5; + __alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha)); + + n64 = n & ~63; + + for (; i < n64; i+= 64) { + _mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0])); + _mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16])); + _mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32])); + _mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48])); + } + +#endif + + for (; i < n; i+= 32) { + _mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8])); + _mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16])); + _mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24])); + } +} +#else +#include "saxpy_microk_haswell-2.c" +#endif + diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index c3ab2ffe6..3536afc9e 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c new file mode 100644 index 000000000..1fcb7f27c --- /dev/null +++ b/kernel/x86_64/sdot_microk_skylakex-2.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_16 1 + +#include + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + +{ + int i = 0; + __m256 accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); + +#ifdef __AVX512CD__ + __m512 accum_05, accum_15, accum_25, accum_35; + int n64; + n64 = n & (~63); + + accum_05 = _mm512_setzero_ps(); + accum_15 = _mm512_setzero_ps(); + accum_25 = _mm512_setzero_ps(); + accum_35 = _mm512_setzero_ps(); + + for (; i < n64; i += 64) { + accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]); + accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]); + accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]); + accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]); + } + + /* + * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code + * below can continue using the intermediate results in its loop + */ + accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1); + accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1); + accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1); + accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1); + +#endif + for (; i < n; i += 32) { + accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]); + accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]); + accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]); + accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]); + } + + /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128 half_accum0; + + /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ + half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + *dot = half_accum0[0]; +} + +#else +#include "sdot_microk_haswell-2.c" +#endif diff --git a/utest/ctest.h b/utest/ctest.h index 1deea32f6..f297dafba 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -84,7 +84,7 @@ struct ctest { #endif #if _MSC_VER < 1900 -#define snprintf _snprintf_s +#define snprintf _snprintf #endif #ifndef __cplusplus