Merge pull request #1747 from xianyi/develop

Merge develop into 0.3.x for 0.3.3
This commit is contained in:
Martin Kroeker 2018-08-30 23:42:19 +02:00 committed by GitHub
commit 422a8fa953
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
53 changed files with 2551 additions and 180 deletions

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 2)
set(OpenBLAS_PATCH_VERSION 3.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@ -150,6 +150,7 @@ endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
# Android needs to explicitly link against libm
if(ANDROID)
@ -169,6 +170,7 @@ endif()
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )

View File

@ -1,4 +1,115 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.2
30-Jul-2018
common:
* fixes for regressions caused by the rewrite of the thread
initialization code in 0.3.1
POWER:
* fixed cpu autodetection for the BSDs
MIPS64:
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
x86_64:
* added autodetection of AMD Ryzen 2
* fixed build with older versions of MSVC
====================================================================
Version 0.3.1
01-Jul-2018
common:
* rewritten thread initialization code with significantly reduced overhead
* added CBLAS interfaces to the IxAMIN BLAS extension functions
* fixed the lapack-test target
* CMAKE builds now create an OpenBLASConfig.cmake file
* ZAXPY now uses a single thread for small input sizes
* the LAPACK code was updated from Reference-LAPACK/lapack#253
(fixing LAPACKE interfaces to Aasen's functions)
POWER:
* corrected CROT and ZROT behaviour with zero INC_X
ARMV7:
* corrected xDOT behaviour with zero INC_X or INC_Y
x86_64:
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
specify the list of x86_64 targets to include. Any target not on the list will be supported
by the Sandybridge or Nehalem kernels if available, or by Prescott.
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
* added autodetection of Intel Cannon Lake series as Skylake X
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
recent mingw from MSYS2
* fixed a link error in mixed clang/gfortran builds with OpenMP
* updated the OSX deployment target to 10.8
* switched on parallel make for builds on MS Windows by default
x86:
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
====================================================================
Version 0.3.0
23-May-2018
common:
* fixed some more thread race and locking bugs
* added preliminary support for calling an OpenMP build of the library from multiple threads
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
* general code cleanup
* optimized DSDOT implementation
* improved thread distribution for GEMM
* corrected IMATCOPY/OMATCOPY implementation
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
* cmake build improvements
* pkgconfig file now contains build options
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
* corrections and improvements for systems with more than 64 cpus
* LAPACK code updated to 3.8.0 including later fixes
* added ReLAPACK, a recursive implementation of several LAPACK functions
* Rewrote ROTMG to handle cases that the netlib code failed to address
* Disabled (broken) multithreading code for xTRMV
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
* shared memory access failures on startup are now handled more gracefully
* restored utests from earlier releases (and made them pass on all affected systems)
SPARC:
* several fixes for cpu autodetection
POWER:
* corrected vector register overwriting in several Power8 kernels
* optimized additional BLAS functions
ARM:
* added support for CortexA53 and A72
* added autodetection for ThunderX2T99
* made most optimized kernels the default for generic ARMv8 targets
x86_64:
* parallelized DDOT kernel for Haswell
* changed alignment directives in assembly kernels to boost performance on OSX
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
* added support for building on OpenBSD and Dragonfly
* updated compiler options to work with Intel release 2018
* support fully optimized build with clang/flang on Microsoft Windows
* fixed building on AIX
IBM Z:
* added optimized BLAS 1/2 functions
MIPS:
* fixed cpu autodetection helper code
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
* added mips64 I6500 cpu
====================================================================
Version 0.2.20
24-Jul-2017

View File

@ -97,7 +97,7 @@ endif
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
ifdef SMP
ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else ifeq ($(OSNAME), Haiku)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif

View File

@ -66,7 +66,7 @@ endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.2
VERSION = 0.3.3.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1
# BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to use the new, still somewhat experimental code that uses
# thread-local storage instead of a central memory buffer in memory.c
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
# for this to work.
USE_TLS = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran
# compilers support this. It's safe to keep it commented out if you

View File

@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
ifdef USE_TLS
CCOMMON_OPT += -DUSE_TLS
endif
ifndef SYMBOLPREFIX
SYMBOLPREFIX =
endif

View File

@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif

View File

@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.

View File

@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
FLOAT beta [] = {1.0, 0.0};
char trans='N';
blasint m, i, j;
blasint inc_x=1,inc_y=1;

View File

@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
@ -223,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1;
$cross = 0;
$cross = 1 if ($os ne $hostos);
if ($architecture ne $hostarch) {
$cross = 1;
@ -231,6 +231,8 @@ if ($architecture ne $hostarch) {
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
}
$cross = 1 if ($os ne $hostos);
$openmp = "" if $ENV{USE_OPENMP} != 1;
$linker_L = "";

View File

@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR)
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
endif ()
if (USE_TLS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
endif ()
# Only for development
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")

View File

@ -105,6 +105,10 @@ extern "C" {
#endif
#endif
#ifdef OS_HAIKU
#define NO_SYSV_IPC
#endif
#ifdef OS_WINDOWS
#ifdef ATOM
#define GOTO_ATOM ATOM
@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
#ifdef USE64BITINT
typedef BLASLONG blasint;
#if defined(OS_WINDOWS) && defined(__64BIT__)
#define blasabs(x) llabs(x)
#else
#define blasabs(x) labs(x)
#endif
#else
typedef int blasint;
#define blasabs(x) abs(x)
#endif
#else
#ifdef USE64BITINT

View File

@ -29,15 +29,18 @@
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13"
"Z13",
"Z14"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13"
"z13",
"z14"
};
int detect(void)
@ -62,6 +65,10 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
/* detect z14, but fall back to z13 */
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
return CPU_GENERIC;
}
@ -107,5 +114,9 @@ void get_cpuconfig(void)
printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
}
}

View File

@ -101,6 +101,10 @@ OS_INTERIX
OS_LINUX
#endif
#if defined(__HAIKU__)
OS_HAIKU
#endif
#if defined(__i386) || defined(_X86)
ARCH_X86
#endif

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>

File diff suppressed because it is too large Load Diff

View File

@ -122,7 +122,7 @@ endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
so : ../$(LIBSONAME)

View File

@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) lenx = m;
if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) lenx = m;
if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
long double s;
long double r, roe, z;
long double ada = fabs(da);
long double adb = fabs(db);
long double ada = fabsl(da);
long double adb = fabsl(db);
long double scale = ada + adb;
#ifndef CBLAS

View File

@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
if (n == 0) return;
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans & 1) lenx = m;
if (trans & 1) leny = n;
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) return;

View File

@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans & 1) lenx = m;
if (trans & 1) leny = n;
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) return;

View File

@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
long double db_i = *(DB + 1);
long double r;
long double ada = fabs(da_r) + fabs(da_i);
long double ada = fabsl(da_r) + fabsl(da_i);
PRINT_DEBUG_NAME;

View File

@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

View File

@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8)
USE_TRMM = 1
endif
ifeq ($(CORE), Z13)
ifeq ($(ARCH), zarch)
USE_TRMM = 1
endif

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_steamroller-2.c"
#elif defined(PILEDRIVER)
#include "daxpy_microk_piledriver-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "daxpy_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "daxpy_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "daxpy_microk_sandy-2.c"
#endif

View File

@ -0,0 +1,71 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#include <immintrin.h>
#define HAVE_KERNEL_8 1
/*
 * y[k] += alpha * x[k] for k in [0, n), using unaligned vector loads/stores.
 *
 * The AVX2 loop consumes 16 doubles per iteration and has no scalar tail, so
 * n must be a multiple of 16; otherwise the last iteration touches elements
 * past n.  When __AVX512CD__ is defined, the leading multiple-of-32 part of
 * the range is handled with 512-bit vectors first and the AVX2 loop only
 * finishes whatever is left.
 */
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG pos = 0;
	/* alpha replicated into every lane of a 256-bit vector */
	const __m256d scale256 = _mm256_broadcastsd_pd(_mm_load_sd(alpha));

#ifdef __AVX512CD__
	/* alpha replicated into every lane of a 512-bit vector */
	const __m512d scale512 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
	/* n rounded down to a multiple of 32 */
	const BLASLONG wide_end = n & ~31;

	while (pos < wide_end) {
		FLOAT *xs = &x[pos];
		FLOAT *ys = &y[pos];

		_mm512_storeu_pd(ys +  0, _mm512_loadu_pd(ys +  0) + scale512 * _mm512_loadu_pd(xs +  0));
		_mm512_storeu_pd(ys +  8, _mm512_loadu_pd(ys +  8) + scale512 * _mm512_loadu_pd(xs +  8));
		_mm512_storeu_pd(ys + 16, _mm512_loadu_pd(ys + 16) + scale512 * _mm512_loadu_pd(xs + 16));
		_mm512_storeu_pd(ys + 24, _mm512_loadu_pd(ys + 24) + scale512 * _mm512_loadu_pd(xs + 24));

		pos += 32;
	}
#endif

	while (pos < n) {
		FLOAT *xs = &x[pos];
		FLOAT *ys = &y[pos];

		_mm256_storeu_pd(ys +  0, _mm256_loadu_pd(ys +  0) + scale256 * _mm256_loadu_pd(xs +  0));
		_mm256_storeu_pd(ys +  4, _mm256_loadu_pd(ys +  4) + scale256 * _mm256_loadu_pd(xs +  4));
		_mm256_storeu_pd(ys +  8, _mm256_loadu_pd(ys +  8) + scale256 * _mm256_loadu_pd(xs +  8));
		_mm256_storeu_pd(ys + 12, _mm256_loadu_pd(ys + 12) + scale256 * _mm256_loadu_pd(xs + 12));

		pos += 16;
	}
}
#else
#include "daxpy_microk_haswell-2.c"
#endif

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ddot_microk_piledriver-2.c"
#elif defined(NEHALEM)
#include "ddot_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "ddot_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "ddot_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "ddot_microk_sandy-2.c"
#endif

View File

@ -0,0 +1,96 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define HAVE_KERNEL_8 1
#include <immintrin.h>
/*
 * Dot product of x[0..n) and y[0..n); the result overwrites *dot (it is not
 * accumulated into the previous value).
 *
 * The AVX2 loop consumes 16 doubles per iteration with no scalar tail, so n
 * must be a multiple of 16; otherwise the last iteration reads past n.  When
 * __AVX512CD__ is defined, the leading multiple-of-32 part of the range is
 * processed with 512-bit accumulators that are then folded into the 256-bit
 * ones.
 */
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
	/* same width as n: an int index would overflow once n exceeds INT_MAX */
	BLASLONG i = 0;
	__m256d accum_0, accum_1, accum_2, accum_3;

	accum_0 = _mm256_setzero_pd();
	accum_1 = _mm256_setzero_pd();
	accum_2 = _mm256_setzero_pd();
	accum_3 = _mm256_setzero_pd();

#ifdef __AVX512CD__
	__m512d accum_05, accum_15, accum_25, accum_35;
	/* BLASLONG so that the rounded-down bound is not truncated for large n */
	BLASLONG n32;

	/* n rounded down to a multiple of 32 */
	n32 = n & (~31);

	accum_05 = _mm512_setzero_pd();
	accum_15 = _mm512_setzero_pd();
	accum_25 = _mm512_setzero_pd();
	accum_35 = _mm512_setzero_pd();

	for (; i < n32; i += 32) {
		accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]);
		accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]);
		accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]);
		accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]);
	}

	/*
	 * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
	 * below can continue using the intermediate results in its loop
	 */
	accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1);
	accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1);
	accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1);
	accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1);
#endif

	for (; i < n; i += 16) {
		accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
		accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
		accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
		accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
	}

	/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
	accum_0 = accum_0 + accum_1 + accum_2 + accum_3;

	__m128d half_accum0;

	/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
	half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));

	/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
	half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);

	/* both lanes now hold the full sum; lane 0 is read via vector subscripting */
	*dot = half_accum0[0];
}
#else
#include "ddot_microk_haswell-2.c"
#endif

View File

@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dgemv_n_microk_haswell-4.c"
#elif defined (SKYLAKEX)
#include "dgemv_n_microk_skylakex-4.c"
#endif

View File

@ -0,0 +1,126 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define HAVE_KERNEL_4x4 1
#include <immintrin.h>
/*
 * For i in [0, n):
 *   y[i] += alpha * (ap[0][i]*x[0] + ap[1][i]*x[1] + ap[2][i]*x[2] + ap[3][i]*x[3])
 *
 * ap holds four pointers, each indexed by i; x supplies four scalars.
 * The AVX2 loop consumes 4 doubles per iteration with no scalar tail, so n
 * must be a multiple of 4; otherwise the last iteration touches elements past
 * n.  When __AVX512CD__ is defined, the leading multiple-of-8 part of the
 * range is handled with 512-bit vectors first.
 */
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	/* same width as n, so the index cannot overflow before reaching n */
	BLASLONG i = 0;
	__m256d x0, x1, x2, x3;
	__m256d __alpha;

	/* broadcast each of the four x values and alpha across a 256-bit vector */
	x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
	x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
	x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2]));
	x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3]));
	__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));

#ifdef __AVX512CD__
	/* BLASLONG so that the rounded-down bound is not truncated for large n */
	BLASLONG n5;
	__m512d x05, x15, x25, x35;
	__m512d __alpha5;

	/* n rounded down to a multiple of 8 */
	n5 = n & ~7;

	x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0]));
	x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1]));
	x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2]));
	x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3]));
	__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));

	for (; i < n5; i+= 8) {
		__m512d tempY;
		__m512d sum;

		sum = _mm512_loadu_pd(&ap[0][i]) * x05 +
		      _mm512_loadu_pd(&ap[1][i]) * x15 +
		      _mm512_loadu_pd(&ap[2][i]) * x25 +
		      _mm512_loadu_pd(&ap[3][i]) * x35;

		tempY = _mm512_loadu_pd(&y[i]);
		tempY += sum * __alpha5;
		_mm512_storeu_pd(&y[i], tempY);
	}
#endif

	for (; i < n; i+= 4) {
		__m256d tempY;
		__m256d sum;

		sum = _mm256_loadu_pd(&ap[0][i]) * x0 +
		      _mm256_loadu_pd(&ap[1][i]) * x1 +
		      _mm256_loadu_pd(&ap[2][i]) * x2 +
		      _mm256_loadu_pd(&ap[3][i]) * x3;

		tempY = _mm256_loadu_pd(&y[i]);
		tempY += sum * __alpha;
		_mm256_storeu_pd(&y[i], tempY);
	}
}
#define HAVE_KERNEL_4x2
/*
 * For i in [0, n):
 *   y[i] += alpha * (ap[0][i]*x[0] + ap[1][i]*x[1])
 *
 * ap holds two pointers, each indexed by i; x supplies two scalars.
 * Only a 256-bit path exists here.  It consumes 4 doubles per iteration with
 * no scalar tail, so n must be a multiple of 4; otherwise the last iteration
 * touches elements past n.
 */
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	/* same width as n, so the index cannot overflow before reaching n */
	BLASLONG i;
	__m256d x0, x1;
	__m256d __alpha;

	/* broadcast both x values and alpha across a 256-bit vector */
	x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
	x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
	__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));

	for (i = 0; i < n; i+= 4) {
		__m256d tempY;
		__m256d sum;

		sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1;

		tempY = _mm256_loadu_pd(&y[i]);
		tempY += sum * __alpha;
		_mm256_storeu_pd(&y[i], tempY);
	}
}
#else
#include "dgemv_n_microk_haswell-4.c"
#endif

View File

@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dscal_microk_bulldozer-2.c"
#elif defined(SANDYBRIDGE)
#include "dscal_microk_sandy-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "dscal_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "dscal_microk_skylakex-2.c"
#endif

View File

@ -0,0 +1,77 @@
/***************************************************************************
Copyright (c) 2014-2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#include <immintrin.h>
#define HAVE_KERNEL_8 1
/*
 * x[i] *= alpha for i in [0, n), in place, using unaligned vector accesses.
 *
 * Both code paths advance 8 doubles per iteration with no scalar tail, so n
 * must be a multiple of 8; otherwise the last iteration touches elements past
 * n.  With __AVX512CD__ defined each step is one 512-bit operation, otherwise
 * it is two 256-bit operations.
 */
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
{
	/* same width as n: an int index would overflow once n exceeds INT_MAX */
	BLASLONG i = 0;

#ifdef __AVX512CD__
	/* alpha replicated into every lane of a 512-bit vector */
	__m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
	for (; i < n; i += 8) {
		_mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0]));
	}
#else
	/* alpha replicated into every lane of a 256-bit vector */
	__m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
	for (; i < n; i += 8) {
		_mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0]));
		_mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4]));
	}
#endif
}
/*
 * Overwrite a contiguous vector with zeros: x[i] = 0.0 for i in [0, n).
 *
 *   n      number of elements to clear
 *   alpha  not referenced here; kept so the signature matches dscal_kernel_8
 *   x      vector that is overwritten (its previous contents are never read)
 *
 * Eight elements are stored per loop iteration and there is no scalar tail,
 * so if n is not a multiple of 8 the final iteration also writes past x[n-1].
 * That rounding-up is the one observable difference from a plain
 * memset(x, 0, n * sizeof(*x)); the explicit stores are kept to preserve it.
 */
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
{
    /*
     * The index has the same type as n.  With a plain int index the
     * comparison i < n could never become false (and i would overflow)
     * whenever BLASLONG is wider than int and n exceeds INT_MAX.
     */
    BLASLONG i = 0;

#ifdef __AVX512CD__
    __m512d zero_x8 = _mm512_setzero_pd();

    for (; i < n; i += 8) {
        _mm512_storeu_pd(&x[i], zero_x8);
    }
#else
    __m256d zero_x4 = _mm256_setzero_pd();

    for (; i < n; i += 8) {
        _mm256_storeu_pd(&x[i + 0], zero_x4);
        _mm256_storeu_pd(&x[i + 4], zero_x4);
    }
#endif
}
#else
#include "dscal_microk_haswell-2.c"
#endif

View File

@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_L_microk_bulldozer-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "dsymv_L_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "dsymv_L_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "dsymv_L_microk_sandy-2.c"
#elif defined(NEHALEM)

View File

@ -0,0 +1,161 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#include <immintrin.h>
#define HAVE_KERNEL_4x4 1
/*
 * Combined update over the index range [from, to) using four arrays a[0..3]:
 *
 *   y[i]     += temp1[0]*a[0][i] + temp1[1]*a[1][i]
 *             + temp1[2]*a[2][i] + temp1[3]*a[3][i]      for every i
 *   temp2[k] += sum over i of x[i] * a[k][i]              for k = 0..3
 *
 *   from, to  half-open index range to process
 *   a         four pointers, each indexed with the same i as x and y
 *   x         read only
 *   y         read and updated in place
 *   temp1     four scalars, read only
 *   temp2     four scalars, accumulated into (not overwritten)
 *
 * The range is walked in steps of 8 (AVX512 builds only) and then in steps
 * of 4, with no scalar tail.  The span (to - from) is therefore expected to
 * be a multiple of 4; otherwise the last step touches up to three elements
 * beyond to-1.
 */
static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
    __m256d accum_0, accum_1, accum_2, accum_3;
    __m256d temp1_0, temp1_1, temp1_2, temp1_3;

    /* the 256 bit wide accumulator vectors start out as zero */
    accum_0 = _mm256_setzero_pd();
    accum_1 = _mm256_setzero_pd();
    accum_2 = _mm256_setzero_pd();
    accum_3 = _mm256_setzero_pd();

    /* each temp1 scalar is replicated across all lanes of its own vector */
    temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0]));
    temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1]));
    temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2]));
    temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3]));

#ifdef __AVX512CD__
    __m512d accum_05, accum_15, accum_25, accum_35;
    __m512d temp1_05, temp1_15, temp1_25, temp1_35;
    BLASLONG to2;

    /* the 512 bit wide accumulator vectors start out as zero */
    accum_05 = _mm512_setzero_pd();
    accum_15 = _mm512_setzero_pd();
    accum_25 = _mm512_setzero_pd();
    accum_35 = _mm512_setzero_pd();

    temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0]));
    temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1]));
    temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2]));
    temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3]));

    /*
     * End of the part of the range that can be handled eight elements at a
     * time.  The span is kept in BLASLONG: storing it in an int would narrow
     * the BLASLONG difference whenever BLASLONG is wider than int.
     */
    to2 = from + ((to - from) & ~7);

    for (; from < to2; from += 8) {
        __m512d xv, yv;
        __m512d a0, a1, a2, a3;

        yv = _mm512_loadu_pd(&y[from]);
        xv = _mm512_loadu_pd(&x[from]);

        a0 = _mm512_loadu_pd(&a[0][from]);
        a1 = _mm512_loadu_pd(&a[1][from]);
        a2 = _mm512_loadu_pd(&a[2][from]);
        a3 = _mm512_loadu_pd(&a[3][from]);

        yv += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3;

        accum_05 += xv * a0;
        accum_15 += xv * a1;
        accum_25 += xv * a2;
        accum_35 += xv * a3;

        _mm512_storeu_pd(&y[from], yv);
    }

    /*
     * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
     * below can continue using the intermediate results in its loop
     */
    accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1));
    accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1));
    accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1));
    accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1));
#endif

    /*
     * "<" instead of "!=": the loop ends as soon as the index reaches or
     * passes `to`, so a span that is not an exact multiple of 4 (or an empty
     * or inverted range) can no longer step over `to` and run away.
     */
    for (; from < to; from += 4) {
        __m256d xv, yv;
        __m256d a0, a1, a2, a3;

        yv = _mm256_loadu_pd(&y[from]);
        xv = _mm256_loadu_pd(&x[from]);

        /* load four elements from each of the four a[] arrays */
        a0 = _mm256_loadu_pd(&a[0][from]);
        a1 = _mm256_loadu_pd(&a[1][from]);
        a2 = _mm256_loadu_pd(&a[2][from]);
        a3 = _mm256_loadu_pd(&a[3][from]);

        yv += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3;

        accum_0 += xv * a0;
        accum_1 += xv * a1;
        accum_2 += xv * a2;
        accum_3 += xv * a3;

        _mm256_storeu_pd(&y[from], yv);
    }

    /*
     * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2
     * output array. There is no direct instruction for this in 256 bit space, only in 128 space.
     */
    __m128d half_accum0, half_accum1, half_accum2, half_accum3;

    /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */
    half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
    half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1));
    half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1));
    half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1));

    /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
    half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
    half_accum1 = _mm_hadd_pd(half_accum1, half_accum1);
    half_accum2 = _mm_hadd_pd(half_accum2, half_accum2);
    half_accum3 = _mm_hadd_pd(half_accum3, half_accum3);

    /* and add the lowest double value from each of these vectors to the temp2 output */
    temp2[0] += half_accum0[0];
    temp2[1] += half_accum1[0];
    temp2[2] += half_accum2[0];
    temp2[3] += half_accum3[0];
}
#else
#include "dsymv_L_microk_haswell-2.c"
#endif

View File

@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM)
#include "saxpy_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "saxpy_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "saxpy_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "saxpy_microk_sandy-2.c"
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)

View File

@ -0,0 +1,69 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define HAVE_KERNEL_16 1
#include <immintrin.h>
/*
 * In-place AXPY on contiguous vectors: y[i] = y[i] + (*alpha) * x[i]
 * for i in [0, n).
 *
 *   n      number of elements to process
 *   x      source vector, read only
 *   y      destination vector, read and overwritten
 *   alpha  pointer to the scalar factor (only *alpha is read)
 *
 * AVX512 builds first process blocks of 64 elements; whatever is left is
 * processed in blocks of 32.  There is no scalar tail, so a count that is not
 * a multiple of 32 makes the last block touch elements past index n-1.
 */
static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    /* scalar factor replicated across a 256 bit vector */
    const __m256 scale_x8 = _mm256_broadcastss_ps(_mm_load_ss(alpha));
    BLASLONG pos = 0;

#ifdef __AVX512CD__
    /* scalar factor replicated across a 512 bit vector */
    const __m512 scale_x16 = _mm512_broadcastss_ps(_mm_load_ss(alpha));
    /* n rounded down to a multiple of 64 */
    const BLASLONG wide_end = n & ~63;

    while (pos < wide_end) {
        FLOAT *src = x + pos;
        FLOAT *dst = y + pos;

        _mm512_storeu_ps(dst +  0, _mm512_loadu_ps(dst +  0) + scale_x16 * _mm512_loadu_ps(src +  0));
        _mm512_storeu_ps(dst + 16, _mm512_loadu_ps(dst + 16) + scale_x16 * _mm512_loadu_ps(src + 16));
        _mm512_storeu_ps(dst + 32, _mm512_loadu_ps(dst + 32) + scale_x16 * _mm512_loadu_ps(src + 32));
        _mm512_storeu_ps(dst + 48, _mm512_loadu_ps(dst + 48) + scale_x16 * _mm512_loadu_ps(src + 48));

        pos += 64;
    }
#endif

    /* remainder (or everything, without AVX512): 4 x 8 elements per pass */
    while (pos < n) {
        FLOAT *src = x + pos;
        FLOAT *dst = y + pos;

        _mm256_storeu_ps(dst +  0, _mm256_loadu_ps(dst +  0) + scale_x8 * _mm256_loadu_ps(src +  0));
        _mm256_storeu_ps(dst +  8, _mm256_loadu_ps(dst +  8) + scale_x8 * _mm256_loadu_ps(src +  8));
        _mm256_storeu_ps(dst + 16, _mm256_loadu_ps(dst + 16) + scale_x8 * _mm256_loadu_ps(src + 16));
        _mm256_storeu_ps(dst + 24, _mm256_loadu_ps(dst + 24) + scale_x8 * _mm256_loadu_ps(src + 24));

        pos += 32;
    }
}
#else
#include "saxpy_microk_haswell-2.c"
#endif

View File

@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sdot_microk_steamroller-2.c"
#elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#elif defined(HASWELL) || defined(ZEN)
#include "sdot_microk_haswell-2.c"
#elif defined (SKYLAKEX)
#include "sdot_microk_skylakex-2.c"
#elif defined(SANDYBRIDGE)
#include "sdot_microk_sandy-2.c"
#endif

View File

@ -0,0 +1,98 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#define HAVE_KERNEL_16 1
#include <immintrin.h>
/*
 * Dot product of two contiguous vectors: *dot = sum of x[i] * y[i]
 * for i in [0, n).
 *
 *   n    number of elements to process
 *   x,y  input vectors, read only
 *   dot  receives the result; its previous value is overwritten, not
 *        accumulated into
 *
 * AVX512 builds first process blocks of 64 elements; whatever is left is
 * processed in blocks of 32.  There is no scalar tail, so a count that is not
 * a multiple of 32 makes the last block read elements past index n-1.
 */
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
    /*
     * Index and block limit have the same type as n.  As plain ints they
     * would be truncated / overflow whenever BLASLONG is wider than int and
     * n exceeds INT_MAX.
     */
    BLASLONG i = 0;
    __m256 accum_0, accum_1, accum_2, accum_3;

    accum_0 = _mm256_setzero_ps();
    accum_1 = _mm256_setzero_ps();
    accum_2 = _mm256_setzero_ps();
    accum_3 = _mm256_setzero_ps();

#ifdef __AVX512CD__
    __m512 accum_05, accum_15, accum_25, accum_35;
    BLASLONG n64;

    /* n rounded down to a multiple of 64 */
    n64 = n & (~63);

    accum_05 = _mm512_setzero_ps();
    accum_15 = _mm512_setzero_ps();
    accum_25 = _mm512_setzero_ps();
    accum_35 = _mm512_setzero_ps();

    for (; i < n64; i += 64) {
        accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]);
        accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]);
        accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]);
        accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]);
    }

    /*
     * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
     * below can continue using the intermediate results in its loop
     *
     * The halves are taken with casts plus the 64x4 extract, which are part
     * of AVX512F.  The 32x8 extract would yield the same bits but belongs to
     * AVX512DQ, which the __AVX512CD__ guard above does not test for.
     */
    accum_0 = _mm256_add_ps(_mm512_castps512_ps256(accum_05),
                            _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(accum_05), 1)));
    accum_1 = _mm256_add_ps(_mm512_castps512_ps256(accum_15),
                            _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(accum_15), 1)));
    accum_2 = _mm256_add_ps(_mm512_castps512_ps256(accum_25),
                            _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(accum_25), 1)));
    accum_3 = _mm256_add_ps(_mm512_castps512_ps256(accum_35),
                            _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(accum_35), 1)));
#endif

    /* remainder (or everything, without AVX512): 4 x 8 elements per pass */
    for (; i < n; i += 32) {
        accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]);
        accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]);
        accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]);
        accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]);
    }

    /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
    accum_0 = accum_0 + accum_1 + accum_2 + accum_3;

    __m128 half_accum0;

    /* Add upper half to lower half of the 256 bit vector to get a 128 bit vector */
    half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1);

    /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
    half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
    half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);

    /* after two horizontal adds the lowest lane holds the total */
    *dot = half_accum0[0];
}
#else
#include "sdot_microk_haswell-2.c"
#endif

View File

@ -280,8 +280,8 @@
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
INTEGER ILAENV2STAGE
EXTERNAL LSAME, ILAENV2STAGE
* ..
* .. Executable Statements ..
*
@ -297,9 +297,9 @@
*
* Determine the block size, the workspace size and the hous size.
*
IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
*
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
INFO = -1

View File

@ -285,8 +285,8 @@
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
INTEGER ILAENV2STAGE
EXTERNAL LSAME, ILAENV2STAGE
* ..
* .. Executable Statements ..
*
@ -296,7 +296,7 @@
INFO = 0
UPPER = LSAME( UPLO, 'U' )
LQUERY = ( LWORK.EQ.-1 )
LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
INFO = -1

View File

@ -277,8 +277,8 @@
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
INTEGER ILAENV2STAGE
EXTERNAL LSAME, ILAENV2STAGE
* ..
* .. Executable Statements ..
*
@ -294,9 +294,9 @@
*
* Determine the block size, the workspace size and the hous size.
*
IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
*
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
INFO = -1

View File

@ -285,8 +285,8 @@
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
INTEGER ILAENV2STAGE
EXTERNAL LSAME, ILAENV2STAGE
* ..
* .. Executable Statements ..
*
@ -296,7 +296,7 @@
INFO = 0
UPPER = LSAME( UPLO, 'U' )
LQUERY = ( LWORK.EQ.-1 )
LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
INFO = -1

View File

@ -277,8 +277,8 @@
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
INTEGER ILAENV2STAGE
EXTERNAL LSAME, ILAENV2STAGE
* ..
* .. Executable Statements ..
*
@ -294,9 +294,9 @@
*
* Determine the block size, the workspace size and the hous size.
*
IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
*
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
INFO = -1

View File

@ -285,8 +285,8 @@
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
INTEGER ILAENV2STAGE
EXTERNAL LSAME, ILAENV2STAGE
* ..
* .. Executable Statements ..
*
@ -296,7 +296,7 @@
INFO = 0
UPPER = LSAME( UPLO, 'U' )
LQUERY = ( LWORK.EQ.-1 )
LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
INFO = -1

View File

@ -280,8 +280,8 @@
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
INTEGER ILAENV2STAGE
EXTERNAL LSAME, ILAENV2STAGE
* ..
* .. Executable Statements ..
*
@ -297,9 +297,9 @@
*
* Determine the block size, the workspace size and the hous size.
*
IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
*
IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
INFO = -1

View File

@ -285,8 +285,8 @@
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
INTEGER ILAENV2STAGE
EXTERNAL LSAME, ILAENV2STAGE
* ..
* .. Executable Statements ..
*
@ -296,7 +296,7 @@
INFO = 0
UPPER = LSAME( UPLO, 'U' )
LQUERY = ( LWORK.EQ.-1 )
LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
INFO = -1