Merge branch 'develop' into interim033

This commit is contained in:
Martin Kroeker 2018-08-25 19:45:19 +02:00 committed by GitHub
commit b1b743f434
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
51 changed files with 1130 additions and 57 deletions

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 1.dev)
set(OpenBLAS_PATCH_VERSION 3.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@ -150,6 +150,7 @@ endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
# Android needs to explicitly link against libm
if(ANDROID)
@ -169,6 +170,7 @@ endif()
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )

View File

@ -1,4 +1,115 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.2
30-Jul-2018
common:
* fixes for regressions caused by the rewrite of the thread
initialization code in 0.3.1
POWER:
* fixed cpu autodetection for the BSDs
MIPS64:
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
x86_64:
* added autodetection of AMD Ryzen 2
* fixed build with older versions of MSVC
====================================================================
Version 0.3.1
01-Jul-2018
common:
* rewritten thread initialization code with significantly reduced overhead
* added CBLAS interfaces to the IxAMIN BLAS extension functions
* fixed the lapack-test target
* CMAKE builds now create an OpenBLASConfig.cmake file
* ZAXPY now uses a single thread for small input sizes
* the LAPACK code was updated from Reference-LAPACK/lapack#253
(fixing LAPACKE interfaces to Aasen's functions)
POWER:
* corrected CROT and ZROT behaviour with zero INC_X
ARMV7:
* corrected xDOT behaviour with zero INC_X or INC_Y
x86_64:
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
specify the list of x86_64 targets to include. Any target not on the list will be supported
by the Sandybridge or Nehalem kernels if available, or by Prescott.
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
* added autodetection of Intel Cannon Lake series as Skylake X
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
recent mingw from MSYS2
* fixed a link error in mixed clang/gfortran builds with OpenMP
* updated the OSX deployment target to 10.8
* switched on parallel make for builds on MS Windows by default
x86:
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
====================================================================
Version 0.3.0
23-May-2108
common:
* fixed some more thread race and locking bugs
* added preliminary support for calling an OpenMP build of the library from multiple threads
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
* general code cleanup
* optimized DSDOT implementation
* improved thread distribution for GEMM
* corrected IMATCOPY/OMATCOPY implementation
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
* cmake build improvements
* pkgconfig file now contains build options
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
* corrections and improvements for systems with more than 64 cpus
* LAPACK code updated to 3.8.0 including later fixes
* added ReLAPACK, a recursive implementation of several LAPACK functions
* Rewrote ROTMG to handle cases that the netlib code failed to address
* Disabled (broken) multithreading code for xTRMV
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
* shared memory access failures on startup are now handled more gracefully
* restored utests from earlier releases (and made them pass on all affected systems)
SPARC:
* several fixes for cpu autodetection
POWER:
* corrected vector register overwriting in several Power8 kernels
* optimized additional BLAS functions
ARM:
* added support for CortexA53 and A72
* added autodetection for ThunderX2T99
* made most optimized kernels the default for generic ARMv8 targets
x86_64:
* parallelized DDOT kernel for Haswell
* changed alignment directives in assembly kernels to boost performance on OSX
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
* added support for building on OpenBSD and Dragonfly
* updated compiler options to work with Intel release 2018
* support fully optimized build with clang/flang on Microsoft Windows
* fixed building on AIX
IBM Z:
* added optimized BLAS 1/2 functions
MIPS:
* fixed cpu autodetection helper code
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
* added mips64 I6500 cpu
====================================================================
Version 0.2.20
24-Jul-2017

View File

@ -97,7 +97,7 @@ endif
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
ifdef SMP
ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else ifeq ($(OSNAME), Haiku)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif

View File

@ -66,7 +66,7 @@ endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \

View File

@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif

View File

@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels.
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.

View File

@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
FLOAT beta [] = {1.0, 0.0};
char trans='N';
blasint m, i, j;
blasint inc_x=1,inc_y=1;

View File

@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);

View File

@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
endif ()
# Cannot run getarch on target if we are cross-compiling
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
# Write to config as getarch would
# TODO: Set up defines that getarch sets up based on every other target

View File

@ -68,7 +68,7 @@ endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()

View File

@ -105,6 +105,10 @@ extern "C" {
#endif
#endif
#ifdef OS_HAIKU
#define NO_SYSV_IPC
#endif
#ifdef OS_WINDOWS
#ifdef ATOM
#define GOTO_ATOM ATOM
@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
#ifdef USE64BITINT
typedef BLASLONG blasint;
#if defined(OS_WINDOWS) && defined(__64BIT__)
#define blasabs(x) llabs(x)
#else
#define blasabs(x) labs(x)
#endif
#else
typedef int blasint;
#define blasabs(x) abs(x)
#endif
#else
#ifdef USE64BITINT

View File

@ -142,6 +142,52 @@ int detect(void){
return CPUTYPE_PPC970;
#endif
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
int id;
id = __asm __volatile("mfpvr %0" : "=r"(id));
switch ( id >> 16 ) {
case 0x4e: // POWER9
return return CPUTYPE_POWER8;
break;
case 0x4d:
case 0x4b: // POWER8/8E
return CPUTYPE_POWER8;
break;
case 0x4a:
case 0x3f: // POWER7/7E
return CPUTYPE_POWER6;
break;
case 0x3e:
return CPUTYPE_POWER6;
break;
case 0x3a:
return CPUTYPE_POWER5;
break;
case 0x35:
case 0x38: // POWER4 /4+
return CPUTYPE_POWER4;
break;
case 0x40:
case 0x41: // POWER3 /3+
return CPUTYPE_POWER3;
break;
case 0x39:
case 0x3c:
case 0x44:
case 0x45:
return CPUTYPE_PPC970;
break;
case 0x70:
return CPUTYPE_CELL;
break;
case 0x8003:
return CPUTYPE_PPCG4;
break;
default:
return CPUTYPE_UNKNOWN;
}
#endif
}
void get_architecture(void){

View File

@ -1452,6 +1452,8 @@ int get_cpuname(void){
switch (model) {
case 1:
// AMD Ryzen
case 8:
// AMD Ryzen2
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;

View File

@ -29,15 +29,18 @@
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13"
"Z13",
"Z14"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13"
"z13",
"z14"
};
int detect(void)
@ -62,6 +65,10 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
/* detect z14, but fall back to z13 */
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
return CPU_GENERIC;
}
@ -107,5 +114,9 @@ void get_cpuconfig(void)
printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
}
}

View File

@ -101,6 +101,10 @@ OS_INTERIX
OS_LINUX
#endif
#if defined(__HAIKU__)
OS_HAIKU
#endif
#if defined(__i386) || defined(_X86)
ARCH_X86
#endif

View File

@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
/* Make sure if no one is using workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
#if defined(FUSED_GEMM) && !defined(TIMING)
/* Fused operation to copy region of B into workspace and apply kernel */
@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#endif
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
/* Make sure if no one is using workspace */
START_RPCC();
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
/* Set flag so other threads can access local region of B */
/* Set flag so other threads can access local region of B */
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB;
}
WMB;
}
/* Get regions of B from other threads and apply kernel */
@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
} while (current != mypos);
/* Iterate through steps of m
/* Iterate through steps of m
* Note: First step has already been finished */
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, js);
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
/* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>

View File

@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){
#ifndef NO_AVX2
return &gotoblas_HASWELL;
#else
return &gotblas_SANDYBRIDGE;
return &gotoblas_SANDYBRIDGE;
#endif
else
return &gotoblas_NEHALEM;
@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){
}
}
} else if (exfamily == 8) {
if (model == 1) {
if (model == 1 || model == 8) {
if(support_avx())
return &gotoblas_ZEN;
else{

View File

@ -468,6 +468,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
# if defined(OS_WINDOWS)
static DWORD local_storage_key = 0;
DWORD lsk;
# else
static pthread_key_t local_storage_key = 0;
pthread_key_t lsk;
@ -1269,6 +1270,7 @@ void blas_shutdown(void){
#ifdef SMP
BLASFUNC(blas_thread_shutdown)();
#endif
#ifdef SMP
/* Only cleanupIf we were built for threading and TLS was initialized */
if (local_storage_key)

View File

@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
#if defined(_WIN32) && defined(_MSC_VER)
#if _MSC_VER < 1900
#define snprintf _snprintf
#endif
#endif
static char* openblas_config_str=""
#ifdef USE64BITINT
"USE64BITINT "

View File

@ -122,7 +122,7 @@ endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
so : ../$(LIBSONAME)

View File

@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) lenx = m;
if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) lenx = m;
if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
long double s;
long double r, roe, z;
long double ada = fabs(da);
long double adb = fabs(db);
long double ada = fabsl(da);
long double adb = fabsl(db);
long double scale = ada + adb;
#ifndef CBLAS

View File

@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans & 1) lenx = m;
if (trans & 1) leny = n;
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) return;

View File

@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans & 1) lenx = m;
if (trans & 1) leny = n;
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) return;

View File

@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
long double db_i = *(DB + 1);
long double r;
long double ada = fabs(da_r) + fabs(da_i);
long double ada = fabsl(da_r) + fabsl(da_i);
PRINT_DEBUG_NAME;

View File

@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -1,3 +1,12 @@
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
SROTKERNEL = ../mips/rot.c
DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
ifndef SNRM2KERNEL
SNRM2KERNEL = snrm2.S
endif

View File

@ -103,35 +103,83 @@
.align 3
.L12:
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 4 * SIZE(X)
LD b1, 4 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 5 * SIZE(X)
LD b2, 5 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 6 * SIZE(X)
LD b3, 6 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 7 * SIZE(X)
LD b4, 7 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 8 * SIZE(X)
LD b1, 8 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 9 * SIZE(X)
LD b2, 9 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 10 * SIZE(X)
LD b3, 10 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 11 * SIZE(X)
LD b4, 11 * SIZE(Y)
@ -143,29 +191,77 @@
.align 3
.L13:
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 4 * SIZE(X)
LD b1, 4 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 5 * SIZE(X)
LD b2, 5 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 6 * SIZE(X)
LD b3, 6 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 7 * SIZE(X)
LD b4, 7 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
daddiu X, X, 8 * SIZE
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
daddiu Y, Y, 8 * SIZE
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
.align 3
.L15:
@ -179,8 +275,13 @@
LD a1, 0 * SIZE(X)
LD b1, 0 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
daddiu I, I, -1
daddiu X, X, SIZE
@ -225,50 +326,85 @@
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
@ -277,7 +413,13 @@
daddiu I, I, -1
bgtz I, .L23
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
.align 3
.L25:
@ -296,13 +438,20 @@
daddiu I, I, -1
bgtz I, .L26
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
.align 3
.L999:
ADD s1, s1, s2
#ifdef DSDOT
cvt.d.s s1, s1
add.d s1, s1, s2
#else
ADD s1, s1, s2
#endif
j $31
NOP

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_steamroller-2.c"
#elif defined(PILEDRIVER)
#include "daxpy_microk_piledriver-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "daxpy_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "daxpy_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "daxpy_microk_sandy-2.c"
#endif

View File

@ -0,0 +1,71 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#include <immintrin.h>
#define HAVE_KERNEL_8 1
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i = 0;
__m256d __alpha;
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
#ifdef __AVX512CD__
BLASLONG n32;
__m512d __alpha5;
__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
n32 = n & ~31;
for (; i < n32; i+= 32) {
_mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0]));
_mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + __alpha5 * _mm512_loadu_pd(&x[i + 8]));
_mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16]));
_mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24]));
}
#endif
for (; i < n; i+= 16) {
_mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0]));
_mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4]));
_mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8]));
_mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
}
}
#else
#include "daxpy_microk_haswell-2.c"
#endif

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ddot_microk_piledriver-2.c"
#elif defined(NEHALEM)
#include "ddot_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "ddot_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "ddot_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "ddot_microk_sandy-2.c"
#endif

View File

@ -0,0 +1,96 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define HAVE_KERNEL_8 1
#include <immintrin.h>
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
int i = 0;
__m256d accum_0, accum_1, accum_2, accum_3;
accum_0 = _mm256_setzero_pd();
accum_1 = _mm256_setzero_pd();
accum_2 = _mm256_setzero_pd();
accum_3 = _mm256_setzero_pd();
#ifdef __AVX512CD__
__m512d accum_05, accum_15, accum_25, accum_35;
int n32;
n32 = n & (~31);
accum_05 = _mm512_setzero_pd();
accum_15 = _mm512_setzero_pd();
accum_25 = _mm512_setzero_pd();
accum_35 = _mm512_setzero_pd();
for (; i < n32; i += 32) {
accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]);
accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]);
accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]);
accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]);
}
/*
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
* below can continue using the intermediate results in its loop
*/
accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1);
accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1);
accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1);
accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1);
#endif
for (; i < n; i += 16) {
accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
}
/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
__m128d half_accum0;
/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
*dot = half_accum0[0];
}
#else
#include "ddot_microk_haswell-2.c"
#endif

View File

@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dgemv_n_microk_haswell-4.c"
#elif defined (SKYLAKEX)
#include "dgemv_n_microk_skylakex-4.c"
#endif

View File

@ -0,0 +1,126 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define HAVE_KERNEL_4x4 1
#include <immintrin.h>
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
int i = 0;
__m256d x0, x1, x2, x3;
__m256d __alpha;
x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2]));
x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3]));
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
#ifdef __AVX512CD__
int n5;
__m512d x05, x15, x25, x35;
__m512d __alpha5;
n5 = n & ~7;
x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0]));
x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1]));
x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2]));
x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3]));
__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
for (; i < n5; i+= 8) {
__m512d tempY;
__m512d sum;
sum = _mm512_loadu_pd(&ap[0][i]) * x05 +
_mm512_loadu_pd(&ap[1][i]) * x15 +
_mm512_loadu_pd(&ap[2][i]) * x25 +
_mm512_loadu_pd(&ap[3][i]) * x35;
tempY = _mm512_loadu_pd(&y[i]);
tempY += sum * __alpha5;
_mm512_storeu_pd(&y[i], tempY);
}
#endif
for (; i < n; i+= 4) {
__m256d tempY;
__m256d sum;
sum = _mm256_loadu_pd(&ap[0][i]) * x0 +
_mm256_loadu_pd(&ap[1][i]) * x1 +
_mm256_loadu_pd(&ap[2][i]) * x2 +
_mm256_loadu_pd(&ap[3][i]) * x3;
tempY = _mm256_loadu_pd(&y[i]);
tempY += sum * __alpha;
_mm256_storeu_pd(&y[i], tempY);
}
}
#define HAVE_KERNEL_4x2
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
int i = 0;
__m256d x0, x1;
__m256d __alpha;
x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
for (i = 0; i < n; i+= 4) {
__m256d tempY;
__m256d sum;
sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1;
tempY = _mm256_loadu_pd(&y[i]);
tempY += sum * __alpha;
_mm256_storeu_pd(&y[i], tempY);
}
}
#else
#include "dgemv_n_microk_haswell-4.c"
#endif

View File

@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dscal_microk_bulldozer-2.c"
#elif defined(SANDYBRIDGE)
#include "dscal_microk_sandy-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "dscal_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "dscal_microk_skylakex-2.c"
#endif

View File

@ -0,0 +1,77 @@
/***************************************************************************
Copyright (c) 2014-2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#include <immintrin.h>
#define HAVE_KERNEL_8 1
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
{
int i = 0;
#ifdef __AVX512CD__
__m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
for (; i < n; i += 8) {
_mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0]));
}
#else
__m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
for (; i < n; i += 8) {
_mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0]));
_mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4]));
}
#endif
}
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
{
int i = 0;
/* question to self: Why is this not just memset() */
#ifdef __AVX512CD__
__m512d zero = _mm512_setzero_pd();
for (; i < n; i += 8) {
_mm512_storeu_pd(&x[i], zero);
}
#else
__m256d zero = _mm256_setzero_pd();
for (; i < n; i += 8) {
_mm256_storeu_pd(&x[i + 0], zero);
_mm256_storeu_pd(&x[i + 4], zero);
}
#endif
}
#else
#include "dscal_microk_haswell-2.c"
#endif

View File

@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_L_microk_bulldozer-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "dsymv_L_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "dsymv_L_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "dsymv_L_microk_sandy-2.c"
#elif defined(NEHALEM)

View File

@ -0,0 +1,161 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#include <immintrin.h>
#define HAVE_KERNEL_4x4 1
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
__m256d accum_0, accum_1, accum_2, accum_3;
__m256d temp1_0, temp1_1, temp1_2, temp1_3;
/* the 256 bit wide acculmulator vectors start out as zero */
accum_0 = _mm256_setzero_pd();
accum_1 = _mm256_setzero_pd();
accum_2 = _mm256_setzero_pd();
accum_3 = _mm256_setzero_pd();
temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0]));
temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1]));
temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2]));
temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3]));
#ifdef __AVX512CD__
__m512d accum_05, accum_15, accum_25, accum_35;
__m512d temp1_05, temp1_15, temp1_25, temp1_35;
BLASLONG to2;
int delta;
/* the 512 bit wide accumulator vectors start out as zero */
accum_05 = _mm512_setzero_pd();
accum_15 = _mm512_setzero_pd();
accum_25 = _mm512_setzero_pd();
accum_35 = _mm512_setzero_pd();
temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0]));
temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1]));
temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2]));
temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3]));
delta = (to - from) & ~7;
to2 = from + delta;
for (; from < to2; from += 8) {
__m512d _x, _y;
__m512d a0, a1, a2, a3;
_y = _mm512_loadu_pd(&y[from]);
_x = _mm512_loadu_pd(&x[from]);
a0 = _mm512_loadu_pd(&a[0][from]);
a1 = _mm512_loadu_pd(&a[1][from]);
a2 = _mm512_loadu_pd(&a[2][from]);
a3 = _mm512_loadu_pd(&a[3][from]);
_y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3;
accum_05 += _x * a0;
accum_15 += _x * a1;
accum_25 += _x * a2;
accum_35 += _x * a3;
_mm512_storeu_pd(&y[from], _y);
};
/*
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
* below can continue using the intermediate results in its loop
*/
accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1));
accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1));
accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1));
accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1));
#endif
for (; from != to; from += 4) {
__m256d _x, _y;
__m256d a0, a1, a2, a3;
_y = _mm256_loadu_pd(&y[from]);
_x = _mm256_loadu_pd(&x[from]);
/* load 4 rows of matrix data */
a0 = _mm256_loadu_pd(&a[0][from]);
a1 = _mm256_loadu_pd(&a[1][from]);
a2 = _mm256_loadu_pd(&a[2][from]);
a3 = _mm256_loadu_pd(&a[3][from]);
_y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3;
accum_0 += _x * a0;
accum_1 += _x * a1;
accum_2 += _x * a2;
accum_3 += _x * a3;
_mm256_storeu_pd(&y[from], _y);
};
/*
* we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2
* output array. There is no direct instruction for this in 256 bit space, only in 128 space.
*/
__m128d half_accum0, half_accum1, half_accum2, half_accum3;
/* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1));
half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1));
half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1));
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
half_accum1 = _mm_hadd_pd(half_accum1, half_accum1);
half_accum2 = _mm_hadd_pd(half_accum2, half_accum2);
half_accum3 = _mm_hadd_pd(half_accum3, half_accum3);
/* and store the lowest double value from each of these vectors in the temp2 output */
temp2[0] += half_accum0[0];
temp2[1] += half_accum1[0];
temp2[2] += half_accum2[0];
temp2[3] += half_accum3[0];
}
#else
#include "dsymv_L_microk_haswell-2.c"
#endif

View File

@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM)
#include "saxpy_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "saxpy_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "saxpy_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "saxpy_microk_sandy-2.c"
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)

View File

@ -0,0 +1,69 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define HAVE_KERNEL_16 1
#include <immintrin.h>
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i = 0;
__m256 __alpha;
__alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha));
#ifdef __AVX512CD__
BLASLONG n64;
__m512 __alpha5;
__alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha));
n64 = n & ~63;
for (; i < n64; i+= 64) {
_mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0]));
_mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16]));
_mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32]));
_mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48]));
}
#endif
for (; i < n; i+= 32) {
_mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0]));
_mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8]));
_mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16]));
_mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24]));
}
}
#else
#include "saxpy_microk_haswell-2.c"
#endif

View File

@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sdot_microk_steamroller-2.c"
#elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "sdot_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "sdot_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "sdot_microk_sandy-2.c"
#endif

View File

@ -0,0 +1,98 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define HAVE_KERNEL_16 1
#include <immintrin.h>
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
int i = 0;
__m256 accum_0, accum_1, accum_2, accum_3;
accum_0 = _mm256_setzero_ps();
accum_1 = _mm256_setzero_ps();
accum_2 = _mm256_setzero_ps();
accum_3 = _mm256_setzero_ps();
#ifdef __AVX512CD__
__m512 accum_05, accum_15, accum_25, accum_35;
int n64;
n64 = n & (~63);
accum_05 = _mm512_setzero_ps();
accum_15 = _mm512_setzero_ps();
accum_25 = _mm512_setzero_ps();
accum_35 = _mm512_setzero_ps();
for (; i < n64; i += 64) {
accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]);
accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]);
accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]);
accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]);
}
/*
* we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
* below can continue using the intermediate results in its loop
*/
accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1);
accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1);
accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1);
accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1);
#endif
for (; i < n; i += 32) {
accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]);
accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]);
accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]);
accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]);
}
/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
__m128 half_accum0;
/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1);
/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
*dot = half_accum0[0];
}
#else
#include "sdot_microk_haswell-2.c"
#endif

View File

@ -84,7 +84,7 @@ struct ctest {
#endif
#if _MSC_VER < 1900
#define snprintf _snprintf_s
#define snprintf _snprintf
#endif
#ifndef __cplusplus