Merge branch 'develop' into interim033

2018-08-25 19:45:19 +02:00 · 2018-08-25 19:45:19 +02:00 · b1b743f434
parent 2caa2210bb 52d3f7af50
commit b1b743f434
51 changed files with 1130 additions and 57 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 1.dev)
+set(OpenBLAS_PATCH_VERSION 3.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

 # Adhere to GNU filesystem layout conventions
@ -150,6 +150,7 @@ endif()

 # add objects to the openblas lib
 add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
+target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)

 # Android needs to explicitly link against libm
 if(ANDROID)
@ -169,6 +170,7 @@ endif()
 # Set output for libopenblas
 set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
+set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")

 foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
  string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
--- a/Changelog.txt
+++ b/Changelog.txt
@ -1,4 +1,115 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.2
+30-Jul-2018
+
+common:
+	* fixes for regressions caused by the rewrite of the thread
+	  initialization code in 0.3.1
+
+POWER:
+	* fixed cpu autodetection for the BSDs
+
+MIPS64:
+	* fixed utest errors in AXPY, DSDOT, ROT and SWAP
+
+x86_64:
+	* added autodetection of AMD Ryzen 2
+	* fixed build with older versions of MSVC
+
+====================================================================
+Version 0.3.1
+01-Jul-2018
+
+common:
+	* rewritten thread initialization code with significantly reduced overhead
+	* added CBLAS interfaces to the IxAMIN BLAS extension functions
+	* fixed the lapack-test target
+	* CMAKE builds now create an OpenBLASConfig.cmake file
+	* ZAXPY now uses a single thread for small input sizes
+	* the LAPACK code was updated from Reference-LAPACK/lapack#253
+          (fixing LAPACKE interfaces to Aasen's functions)
+
+POWER:
+	* corrected CROT and ZROT behaviour with zero INC_X
+
+ARMV7:
+	* corrected xDOT behaviour with zero INC_X or INC_Y
+
+x86_64:
+	* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
+	  this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
+	  (which will still be supported via the slower PRESCOTT kernels when this option is not set)
+	* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to 
+	  specify the list of x86_64 targets to include. Any target not on the list will be supported 
+	  by the Sandybridge or Nehalem kernels if available, or by Prescott.
+	* improved SWITCH_RATIO on Haswell for increased GEMM throughput
+	* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
+	* added autodetection of Intel Cannon Lake series as Skylake X
+	* added a default L2 cache size for hypervisors that return zero here (Chromebook)
+	* fixed a name clash with recent Windows10 headers that broke the build with (at least)
+	  recent mingw from MSYS2
+	* fixed a link error in mixed clang/gfortran builds with OpenMP
+	* updated the OSX deployment target to 10.8
+	* switched on parallel make for builds on MS Windows by default
+
+x86:
+	* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
+
+====================================================================
+Version 0.3.0
+23-May-2108
+
+common:
+	* fixed some more thread race and locking bugs
+	* added preliminary support for calling an OpenMP build of the library from multiple threads
+	* removed performance impact of thread locks added in 0.2.20 on OpenMP code
+	* general code cleanup 
+	* optimized DSDOT implementation
+	* improved thread distribution for GEMM
+	* corrected IMATCOPY/OMATCOPY implementation
+	* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
+	* cmake build improvements
+	* pkgconfig file now contains build options
+	* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
+	* corrections and improvements for systems with more than 64 cpus
+	* LAPACK code updated to 3.8.0 including later fixes
+	* added ReLAPACK, a recursive implementation of several LAPACK functions
+	* Rewrote ROTMG to handle cases that the netlib code failed to address
+	* Disabled (broken) multithreading code for xTRMV
+	* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
+	* shared memory access failures on startup are now handled more gracefully
+	* restored utests from earlier releases (and made them pass on all affected systems)
+
+SPARC:
+	* several fixes for cpu autodetection
+
+POWER:
+	* corrected vector register overwriting in several Power8 kernels
+	* optimized additional BLAS functions
+
+ARM:
+	* added support for CortexA53 and A72 
+	* added autodetection for ThunderX2T99
+	* made most optimized kernels the default for generic ARMv8 targets 
+
+x86_64:
+	* parallelized DDOT kernel for Haswell
+	* changed alignment directives in assembly kernels to boost performance on OSX
+	* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
+	* added support for building on OpenBSD and Dragonfly 
+	* updated compiler options to work with Intel release 2018
+	* support fully optimized build with clang/flang on Microsoft Windows
+	* fixed building on AIX
+
+IBM Z:
+	* added optimized BLAS 1/2 functions
+
+MIPS:
+	* fixed cpu autodetection helper code
+	* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
+	* added mips64 I6500 cpu
+
 ====================================================================
 Version 0.2.20
 24-Jul-2017
--- a/4
+++ b/4
@ -97,7 +97,7 @@ endif

 shared :
 ifndef NO_SHARED
-ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
+ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
 	@$(MAKE) -C exports so
 	@ln -fs $(LIBSONAME) $(LIBPREFIX).so
 	@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
 ifdef SMP
 ifeq ($(OSNAME), WINNT)
 	-@echo "LOADER      = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
+else ifeq ($(OSNAME), Haiku)
+	-@echo "LOADER      = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 else
 	-@echo "LOADER      = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
 endif
--- a/Makefile.install
+++ b/Makefile.install
@ -66,7 +66,7 @@ endif
 #for install shared library
 ifndef NO_SHARED
 	@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
-ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
+ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
 	@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
 	ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
 ifndef NO_AVX512
 CCOMMON_OPT += -march=skylake-avx512
 FCOMMON_OPT += -march=skylake-avx512
+ifeq ($(OSNAME), CYGWIN_NT)
+CCOMMON_OPT += -fno-asynchronous-unwind-tables
+endif
 endif
 endif

--- a/README.md
+++ b/README.md
@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
 - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
 - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
 - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA  on x86-64.
+- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA  on x86-64.
 - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
 - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
 - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
 * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
  Clang 3.0 will generate the wrong AVX binary code.
+* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels.
 * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
  there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
  the library with `BIGNUMA=1`.
--- a/benchmark/gemv.c
+++ b/benchmark/gemv.c
@ -122,7 +122,7 @@ int main(int argc, char *argv[]){

  FLOAT *a, *x, *y;
  FLOAT alpha[] = {1.0, 1.0};
-  FLOAT beta [] = {1.0, 1.0};
+  FLOAT beta [] = {1.0, 0.0};
  char trans='N';
  blasint m, i, j;
  blasint inc_x=1,inc_y=1;
--- a/1
+++ b/1
@ -64,6 +64,7 @@ $os = WINNT           if ($data =~ /OS_WINNT/);
 $os = CYGWIN_NT       if ($data =~ /OS_CYGWIN_NT/);
 $os = Interix         if ($data =~ /OS_INTERIX/);
 $os = Android         if ($data =~ /OS_ANDROID/);
+$os = Haiku           if ($data =~ /OS_HAIKU/);

 $architecture = x86    if ($data =~ /ARCH_X86/);
 $architecture = x86_64 if ($data =~ /ARCH_X86_64/);
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
 endif ()

 # Cannot run getarch on target if we are cross-compiling
-if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
+if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
  # Write to config as getarch would

  # TODO: Set up defines that getarch sets up based on every other target
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@ -68,7 +68,7 @@ endif()

 if (X86_64 OR X86)
  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
-execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
+execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
 if (NO_AVX512 EQUAL 1)
 set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
 endif()
--- a/common.h
+++ b/common.h
@ -105,6 +105,10 @@ extern "C" {
 #endif
 #endif

+#ifdef OS_HAIKU
+#define NO_SYSV_IPC
+#endif
+
 #ifdef OS_WINDOWS
 #ifdef  ATOM
 #define GOTO_ATOM ATOM
@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;

 #ifdef USE64BITINT
 typedef BLASLONG blasint;
+#if defined(OS_WINDOWS) && defined(__64BIT__)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
 #else
 typedef int blasint;
+#define blasabs(x) abs(x)
 #endif
 #else
 #ifdef USE64BITINT
--- a/cpuid_power.c
+++ b/cpuid_power.c
@ -142,6 +142,52 @@ int detect(void){

  return  CPUTYPE_PPC970;
 #endif
+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
+int id;
+id = __asm __volatile("mfpvr %0" : "=r"(id));
+switch ( id >> 16 ) {
+  case 0x4e: // POWER9
+    return  return CPUTYPE_POWER8;
+    break;
+  case 0x4d:
+  case 0x4b: // POWER8/8E 
+    return CPUTYPE_POWER8;
+    break;
+  case 0x4a:
+  case 0x3f:  // POWER7/7E
+    return CPUTYPE_POWER6; 
+    break;
+  case 0x3e:
+    return CPUTYPE_POWER6;
+    break;
+  case 0x3a:
+    return CPUTYPE_POWER5;
+    break;
+  case 0x35:
+  case 0x38: // POWER4 /4+ 
+    return CPUTYPE_POWER4;
+    break;
+  case 0x40:
+  case 0x41: // POWER3 /3+ 
+    return CPUTYPE_POWER3;
+    break;
+  case 0x39:
+  case 0x3c:
+  case 0x44:
+  case 0x45:
+    return CPUTYPE_PPC970;
+    break;
+  case 0x70: 
+    return CPUTYPE_CELL;
+    break;
+  case 0x8003: 
+    return CPUTYPE_PPCG4;
+    break;
+  default:  
+    return  CPUTYPE_UNKNOWN;
+  }
+#endif
 }

 void get_architecture(void){
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@ -1452,6 +1452,8 @@ int get_cpuname(void){
 	switch (model) {
 	case 1:
 	  // AMD Ryzen
+	case 8:
+	  // AMD Ryzen2
 	  if(support_avx())
 #ifndef NO_AVX2
 	    return CPUTYPE_ZEN;
--- a/cpuid_zarch.c
+++ b/cpuid_zarch.c
@ -29,15 +29,18 @@

 #define CPU_GENERIC    	0
 #define CPU_Z13       	1
+#define CPU_Z14       	2

 static char *cpuname[] = {
  "ZARCH_GENERIC",
-  "Z13"
+  "Z13",
+  "Z14"
 };

 static char *cpuname_lower[] = {
  "zarch_generic",
-  "z13"
+  "z13",
+  "z14"
 };

 int detect(void)
@ -62,6 +65,10 @@ int detect(void)
  if (strstr(p, "2964")) return CPU_Z13;
  if (strstr(p, "2965")) return CPU_Z13;

+  /* detect z14, but fall back to z13 */
+  if (strstr(p, "3906")) return CPU_Z13;
+  if (strstr(p, "3907")) return CPU_Z13;
+
  return CPU_GENERIC;
 }

@ -107,5 +114,9 @@ void get_cpuconfig(void)
 	  printf("#define Z13\n");
 	  printf("#define DTB_DEFAULT_ENTRIES 64\n");
 	  break;
+	case CPU_Z14:
+	  printf("#define Z14\n");
+	  printf("#define DTB_DEFAULT_ENTRIES 64\n");
+	  break;
 	}
 }
--- a/ctest.c
+++ b/ctest.c
@ -101,6 +101,10 @@ OS_INTERIX
 OS_LINUX
 #endif

+#if defined(__HAIKU__)
+OS_HAIKU
+#endif
+
 #if defined(__i386) || defined(_X86)
 ARCH_X86
 #endif
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
    for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

+      /* Make sure if no one is using workspace */
+      START_RPCC();
+      for (i = 0; i < args -> nthreads; i++)
+	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
+      STOP_RPCC(waiting1);
+
 #if defined(FUSED_GEMM) && !defined(TIMING)

      /* Fused operation to copy region of B into workspace and apply kernel */
@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      }
 #endif

-      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
-        /* Make sure if no one is using workspace */
-        START_RPCC();
-        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
-        STOP_RPCC(waiting1);
-        /* Set flag so other threads can access local region of B */
+      /* Set flag so other threads can access local region of B */
+      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
-        WMB;
-      }
+      WMB;
    }

    /* Get regions of B from other threads and apply kernel */
@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

        /* Clear synchronization flag if this thread is done with other region of B */
 	if (m_to - m_from == min_i) {
-	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
+	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
 	  WMB;
 	}
      }
    } while (current != mypos);

-    /* Iterate through steps of m
+    /* Iterate through steps of m 
     * Note: First step has already been finished */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
 			   c, ldc, is, js);
          STOP_RPCC(kernel);
-
+          
 #ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1]  - js, div_n) * min_l;
 #endif
-
+          
          /* Clear synchronization flag if this thread is done with region of B */
          if (is + min_i >= m_to) {
-            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
+            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
            WMB;
          }
 	}
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /*********************************************************************/

 #include "common.h"
-#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
+#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
 #include <dlfcn.h>
 #include <signal.h>
 #include <sys/resource.h>
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){
 #ifndef NO_AVX2
 	  return &gotoblas_HASWELL;
 #else
-	  return &gotblas_SANDYBRIDGE;
+	  return &gotoblas_SANDYBRIDGE;
 #endif
 	  else
 	  return &gotoblas_NEHALEM;
@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){
 	  }
 	}
      } else if (exfamily == 8) {
-	if (model == 1) {
+	if (model == 1 || model == 8) {
 	  if(support_avx())
 	    return &gotoblas_ZEN;
 	  else{
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@ -468,6 +468,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
 #  if defined(OS_WINDOWS)
 static DWORD local_storage_key = 0;
 DWORD lsk;
+
 #  else
 static pthread_key_t local_storage_key = 0;
 pthread_key_t lsk;
@ -1269,6 +1270,7 @@ void blas_shutdown(void){
 #ifdef SMP
  BLASFUNC(blas_thread_shutdown)();
 #endif
+
 #ifdef SMP
  /* Only cleanupIf we were built for threading and TLS was initialized */
  if (local_storage_key)
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include <string.h>

+#if defined(_WIN32) && defined(_MSC_VER)
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+#endif
+
 static char* openblas_config_str=""
 #ifdef USE64BITINT
  "USE64BITINT "
--- a/exports/Makefile
+++ b/exports/Makefile
@ -122,7 +122,7 @@ endif
 dllinit.$(SUFFIX) : dllinit.c
 	$(CC) $(CFLAGS) -c -o $(@F) -s $<

-ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
+ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))

 so : ../$(LIBSONAME)

--- a/interface/gbmv.c
+++ b/interface/gbmv.c
@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
  if (trans) lenx = m;
  if (trans) leny = n;

-  if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
+  if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);

  if (alpha == ZERO) return;

--- a/interface/gemv.c
+++ b/interface/gemv.c
@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
  if (trans) lenx = m;
  if (trans) leny = n;

-  if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
+  if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);

  if (alpha == ZERO) return;

--- a/interface/rotg.c
+++ b/interface/rotg.c
@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
  long double s;
  long double r, roe, z;

-  long double ada = fabs(da);
-  long double adb = fabs(db);
+  long double ada = fabsl(da);
+  long double adb = fabsl(db);
  long double scale = ada + adb;

 #ifndef CBLAS
--- a/interface/sbmv.c
+++ b/interface/sbmv.c
@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,

  if (n == 0) return;

-  if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
+  if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);

  if (alpha == ZERO) return;

--- a/interface/spmv.c
+++ b/interface/spmv.c
@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,

  if (n == 0) return;

-  if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
+  if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);

  if (alpha == ZERO) return;

--- a/interface/symv.c
+++ b/interface/symv.c
@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,

  if (n == 0) return;

-  if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
+  if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);

  if (alpha == ZERO) return;

--- a/interface/zgbmv.c
+++ b/interface/zgbmv.c
@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
  if (trans & 1) lenx = m;
  if (trans & 1) leny = n;

-  if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
+  if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);

  if (alpha_r == ZERO && alpha_i == ZERO) return;

--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
  if (trans & 1) lenx = m;
  if (trans & 1) leny = n;

-  if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
+  if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);

  if (alpha_r == ZERO && alpha_i == ZERO) return;

--- a/interface/zhbmv.c
+++ b/interface/zhbmv.c
@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,

  if (n == 0) return;

-  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
+  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

--- a/interface/zhemv.c
+++ b/interface/zhemv.c
@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA

  if (n == 0) return;

-  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
+  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

--- a/interface/zhpmv.c
+++ b/interface/zhpmv.c
@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,

  if (n == 0) return;

-  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
+  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
  long double db_i = *(DB + 1);
  long double r;

-  long double ada = fabs(da_r) + fabs(da_i);
+  long double ada = fabsl(da_r) + fabsl(da_i);

  PRINT_DEBUG_NAME;

--- a/interface/zsbmv.c
+++ b/interface/zsbmv.c
@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT  *ALPHA, FLOAT *a, blasint *

  if (n == 0) return;

-  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
+  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

--- a/kernel/mips64/KERNEL
+++ b/kernel/mips64/KERNEL
@ -1,3 +1,12 @@
+CAXPYKERNEL = ../mips/zaxpy.c
+ZAXPYKERNEL = ../mips/zaxpy.c
+SROTKERNEL = ../mips/rot.c
+DROTKERNEL = ../mips/rot.c
+CROTKERNEL = ../mips/zrot.c
+ZROTKERNEL = ../mips/zrot.c
+CSWAPKERNEL = ../mips/zswap.c
+ZSWAPKERNEL = ../mips/zswap.c
+
 ifndef SNRM2KERNEL
 SNRM2KERNEL = snrm2.S
 endif
--- a/kernel/mips64/dot.S
+++ b/kernel/mips64/dot.S
@ -103,35 +103,83 @@
 	.align 3

 .L12:
+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1  
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
+#endif
 	LD	a1,  4 * SIZE(X)
 	LD	b1,  4 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a2, a2
+cvt.d.s  b2, b2
+madd.d s2, s2, a2, b2
+#else
 	MADD	s2, s2, a2, b2
+#endif
 	LD	a2,  5 * SIZE(X)
 	LD	b2,  5 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a3, a3
+cvt.d.s  b3, b3
+madd.d s1, s1, a3, b3
+#else
 	MADD	s1, s1, a3, b3
+#endif
 	LD	a3,  6 * SIZE(X)
 	LD	b3,  6 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a4, a4
+cvt.d.s  b4, b4
+madd.d s2, s2, a4, b4
+#else
 	MADD	s2, s2, a4, b4
+#endif
 	LD	a4,  7 * SIZE(X)
 	LD	b4,  7 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
+#endif
 	LD	a1,  8 * SIZE(X)
 	LD	b1,  8 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a2, a2
+cvt.d.s  b2, b2
+madd.d s2, s2, a2, b2
+#else
 	MADD	s2, s2, a2, b2
+#endif
 	LD	a2,  9 * SIZE(X)
 	LD	b2,  9 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a3, a3
+cvt.d.s  b3, b3
+madd.d s1, s1, a3, b3
+#else
 	MADD	s1, s1, a3, b3
+#endif
 	LD	a3, 10 * SIZE(X)
 	LD	b3, 10 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a4, a4
+cvt.d.s  b4, b4
+madd.d s2, s2, a4, b4
+#else
 	MADD	s2, s2, a4, b4
+#endif
 	LD	a4, 11 * SIZE(X)
 	LD	b4, 11 * SIZE(Y)

@ -143,29 +191,77 @@
 	.align 3

 .L13:
+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
+#endif
 	LD	a1,  4 * SIZE(X)
 	LD	b1,  4 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s a2, a2
+cvt.d.s b2, b2
+madd.d s2, s2, a2, b2
+#else
 	MADD	s2, s2, a2, b2
+#endif
 	LD	a2,  5 * SIZE(X)
 	LD	b2,  5 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s a3, a3 
+cvt.d.s b3, b3
+madd.d s1, s1, a3, b3
+#else
 	MADD	s1, s1, a3, b3
+#endif
 	LD	a3,  6 * SIZE(X)
 	LD	b3,  6 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s a4, a4
+cvt.d.s b4, b4
+madd.d s2, s2, a4, b4
+#else
 	MADD	s2, s2, a4, b4
+#endif
 	LD	a4,  7 * SIZE(X)
 	LD	b4,  7 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
+#endif
 	daddiu	X, X, 8 * SIZE
+#ifdef DSDOT
+cvt.d.s  a2, a2
+cvt.d.s  b2, b2
+madd.d s2, s2, a2, b2
+#else
 	MADD	s2, s2, a2, b2
+#endif
 	daddiu	Y, Y, 8 * SIZE

+#ifdef DSDOT
+cvt.d.s  a3, a3
+cvt.d.s  b3, b3
+madd.d s1, s1, a3, b3
+#else
 	MADD	s1, s1, a3, b3
+#endif
+#ifdef DSDOT
+cvt.d.s  a4, a4
+cvt.d.s  b4, b4
+madd.d s2, s2, a4, b4
+#else
 	MADD	s2, s2, a4, b4
+#endif
 	.align 3

 .L15:
@ -179,8 +275,13 @@
 	LD	a1,  0 * SIZE(X)
 	LD	b1,  0 * SIZE(Y)

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
-
+#endif
 	daddiu	I, I, -1

 	daddiu	X, X, SIZE
@ -225,50 +326,85 @@
 	LD	b1,  0 * SIZE(Y)
 	dadd	Y, Y, INCY

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
-
+#endif
 	LD	a1,  0 * SIZE(X)
 	dadd	X, X, INCX
 	LD	b1,  0 * SIZE(Y)
 	dadd	Y, Y, INCY

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s2, s2, a1, b1
+#else
 	MADD	s2, s2, a1, b1
-
+#endif
 	LD	a1,  0 * SIZE(X)
 	dadd	X, X, INCX
 	LD	b1,  0 * SIZE(Y)
 	dadd	Y, Y, INCY

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
-
+#endif
 	LD	a1,  0 * SIZE(X)
 	dadd	X, X, INCX
 	LD	b1,  0 * SIZE(Y)
 	dadd	Y, Y, INCY

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s2, s2, a1, b1
+#else
 	MADD	s2, s2, a1, b1
-
+#endif
 	LD	a1,  0 * SIZE(X)
 	dadd	X, X, INCX
 	LD	b1,  0 * SIZE(Y)
 	dadd	Y, Y, INCY

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
-
+#endif
 	LD	a1,  0 * SIZE(X)
 	dadd	X, X, INCX
 	LD	b1,  0 * SIZE(Y)
 	dadd	Y, Y, INCY

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s2, s2, a1, b1
+#else
 	MADD	s2, s2, a1, b1
-
+#endif
 	LD	a1,  0 * SIZE(X)
 	dadd	X, X, INCX
 	LD	b1,  0 * SIZE(Y)
 	dadd	Y, Y, INCY

+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
-
+#endif
 	LD	a1,  0 * SIZE(X)
 	dadd	X, X, INCX
 	LD	b1,  0 * SIZE(Y)
@ -277,7 +413,13 @@
 	daddiu	I, I, -1

 	bgtz	I, .L23
+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s2, s2, a1, b1
+#else
 	MADD	s2, s2, a1, b1
+#endif
 	.align 3

 .L25:
@ -296,13 +438,20 @@
 	daddiu	I, I, -1

 	bgtz	I, .L26
+#ifdef DSDOT
+cvt.d.s  a1, a1
+cvt.d.s  b1, b1
+madd.d s1, s1, a1, b1
+#else
 	MADD	s1, s1, a1, b1
+#endif
 	.align 3

 .L999:
-	ADD	s1, s1, s2
 #ifdef DSDOT
-	cvt.d.s s1, s1
+	add.d s1, s1, s2
+#else
+	ADD	s1, s1, s2
 #endif
 	j	$31
 	NOP
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "daxpy_microk_steamroller-2.c"
 #elif defined(PILEDRIVER)
 #include "daxpy_microk_piledriver-2.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN)
 #include "daxpy_microk_haswell-2.c"
+#elif defined (SKYLAKEX)
+#include "daxpy_microk_skylakex-2.c"
 #elif defined(SANDYBRIDGE)
 #include "daxpy_microk_sandy-2.c"
 #endif
--- a/kernel/x86_64/daxpy_microk_skylakex-2.c
+++ b/kernel/x86_64/daxpy_microk_skylakex-2.c
@ -0,0 +1,71 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#include <immintrin.h>
+
+#define HAVE_KERNEL_8 1
+
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+	BLASLONG i = 0;
+
+	__m256d __alpha;
+
+	__alpha =  _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+
+#ifdef __AVX512CD__
+	BLASLONG n32;
+	__m512d __alpha5;
+	__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
+
+	n32 = n & ~31;
+
+	for (; i < n32; i+= 32) {
+		_mm512_storeu_pd(&y[i +  0], _mm512_loadu_pd(&y[i +  0]) +  __alpha5 * _mm512_loadu_pd(&x[i +  0]));
+		_mm512_storeu_pd(&y[i +  8], _mm512_loadu_pd(&y[i +  8]) +  __alpha5 * _mm512_loadu_pd(&x[i +  8]));
+		_mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) +  __alpha5 * _mm512_loadu_pd(&x[i + 16]));
+		_mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) +  __alpha5 * _mm512_loadu_pd(&x[i + 24]));
+	}
+
+#endif
+
+	for (; i < n; i+= 16) {
+		_mm256_storeu_pd(&y[i +  0], _mm256_loadu_pd(&y[i +  0]) + __alpha * _mm256_loadu_pd(&x[i +  0]));
+		_mm256_storeu_pd(&y[i +  4], _mm256_loadu_pd(&y[i +  4]) + __alpha * _mm256_loadu_pd(&x[i +  4]));
+		_mm256_storeu_pd(&y[i +  8], _mm256_loadu_pd(&y[i +  8]) + __alpha * _mm256_loadu_pd(&x[i +  8]));
+		_mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
+	}
+}
+#else
+#include "daxpy_microk_haswell-2.c"
+#endif
+
+
--- a/kernel/x86_64/ddot.c
+++ b/kernel/x86_64/ddot.c
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ddot_microk_piledriver-2.c"
 #elif defined(NEHALEM)
 #include "ddot_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN)
 #include "ddot_microk_haswell-2.c"
+#elif defined (SKYLAKEX)
+#include "ddot_microk_skylakex-2.c"
 #elif defined(SANDYBRIDGE)
 #include "ddot_microk_sandy-2.c"
 #endif
--- a/kernel/x86_64/ddot_microk_skylakex-2.c
+++ b/kernel/x86_64/ddot_microk_skylakex-2.c
@ -0,0 +1,96 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_8 1
+
+#include <immintrin.h>
+
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+	int i = 0;
+	__m256d accum_0, accum_1, accum_2, accum_3;
+	
+	accum_0 = _mm256_setzero_pd();
+	accum_1 = _mm256_setzero_pd();
+	accum_2 = _mm256_setzero_pd();
+	accum_3 = _mm256_setzero_pd();
+
+#ifdef __AVX512CD__
+	__m512d accum_05, accum_15, accum_25, accum_35;
+	int n32;
+	n32 = n & (~31);
+
+	accum_05 = _mm512_setzero_pd();
+	accum_15 = _mm512_setzero_pd();
+	accum_25 = _mm512_setzero_pd();
+	accum_35 = _mm512_setzero_pd();
+
+	for (; i < n32; i += 32) {
+		accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]);
+		accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]);
+		accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]);
+		accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]);
+	}
+
+	/*
+	 * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
+	 * below can continue using the intermediate results in its loop
+	 */
+	accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1);
+	accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1);
+	accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1);
+	accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1);
+
+#endif
+	for (; i < n; i += 16) {
+		accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
+		accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
+		accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
+		accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
+	}
+
+	/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
+
+	accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+
+	__m128d half_accum0;
+
+	/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
+	half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
+
+	/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
+	half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
+
+	*dot = half_accum0[0];
+}
+
+#else
+#include "ddot_microk_haswell-2.c"
+#endif
--- a/kernel/x86_64/dgemv_n_4.c
+++ b/kernel/x86_64/dgemv_n_4.c
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #if defined(NEHALEM)
 #include "dgemv_n_microk_nehalem-4.c"
-#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
 #include "dgemv_n_microk_haswell-4.c"
+#elif  defined (SKYLAKEX)
+#include "dgemv_n_microk_skylakex-4.c"
 #endif


--- a/kernel/x86_64/dgemv_n_microk_skylakex-4.c
+++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
@ -0,0 +1,126 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_4x4 1
+
+#include <immintrin.h>
+
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	int i = 0;
+
+	__m256d x0, x1, x2, x3;
+	__m256d __alpha;
+
+	x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
+	x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
+	x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2]));
+	x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3]));
+
+	__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+
+#ifdef __AVX512CD__
+	int n5;
+	__m512d x05, x15, x25, x35;
+	__m512d __alpha5;
+	n5 = n & ~7;
+
+	x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0]));
+	x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1]));
+	x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2]));
+	x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3]));
+
+	__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
+
+	for (; i < n5; i+= 8) {
+		__m512d tempY;
+		__m512d sum;
+
+		sum = _mm512_loadu_pd(&ap[0][i]) * x05 +
+		      _mm512_loadu_pd(&ap[1][i]) * x15 +
+		      _mm512_loadu_pd(&ap[2][i]) * x25 +
+		      _mm512_loadu_pd(&ap[3][i]) * x35;
+
+		tempY = _mm512_loadu_pd(&y[i]);
+		tempY += sum *  __alpha5;
+		_mm512_storeu_pd(&y[i], tempY);
+	}
+#endif
+
+	for (; i < n; i+= 4) {
+		__m256d tempY;
+		__m256d sum;
+
+		sum = _mm256_loadu_pd(&ap[0][i]) * x0 +
+		      _mm256_loadu_pd(&ap[1][i]) * x1 +
+		      _mm256_loadu_pd(&ap[2][i]) * x2 +
+		      _mm256_loadu_pd(&ap[3][i]) * x3;
+
+		tempY = _mm256_loadu_pd(&y[i]);
+		tempY += sum *  __alpha;
+		_mm256_storeu_pd(&y[i], tempY);
+	}
+
+} 
+
+
+#define HAVE_KERNEL_4x2
+
+static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	int i = 0;
+
+	__m256d x0, x1;
+	__m256d __alpha;
+
+	x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
+	x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
+
+	__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+
+
+	for (i = 0; i < n; i+= 4) {
+		__m256d tempY;
+		__m256d sum;
+
+		sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1;
+
+		tempY = _mm256_loadu_pd(&y[i]);
+		tempY +=  sum *  __alpha;
+		_mm256_storeu_pd(&y[i], tempY);
+	}
+
+}
+
+#else
+#include "dgemv_n_microk_haswell-4.c"
+#endif 
--- a/kernel/x86_64/dscal.c
+++ b/kernel/x86_64/dscal.c
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dscal_microk_bulldozer-2.c"
 #elif defined(SANDYBRIDGE)
 #include "dscal_microk_sandy-2.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN)
 #include "dscal_microk_haswell-2.c"
+#elif  defined (SKYLAKEX)
+#include "dscal_microk_skylakex-2.c"
 #endif


--- a/kernel/x86_64/dscal_microk_skylakex-2.c
+++ b/kernel/x86_64/dscal_microk_skylakex-2.c
@ -0,0 +1,77 @@
+/***************************************************************************
+Copyright (c) 2014-2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#include <immintrin.h>
+
+#define HAVE_KERNEL_8 1
+
+static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+	int i = 0;
+
+#ifdef __AVX512CD__
+	__m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
+	for (; i < n; i += 8) {
+                _mm512_storeu_pd(&x[i +  0], __alpha5 * _mm512_loadu_pd(&x[i +  0]));
+	}
+#else
+	__m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+	for (; i < n; i += 8) {
+                _mm256_storeu_pd(&x[i +  0], __alpha * _mm256_loadu_pd(&x[i +  0]));
+                _mm256_storeu_pd(&x[i +  4], __alpha * _mm256_loadu_pd(&x[i +  4]));
+	}
+#endif
+} 
+
+
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+	int i = 0;
+
+	/* question to self: Why is this not just memset() */
+
+#ifdef __AVX512CD__
+	__m512d zero = _mm512_setzero_pd();
+	for (; i < n; i += 8) {
+                _mm512_storeu_pd(&x[i], zero);
+	}
+#else
+	__m256d zero = _mm256_setzero_pd();
+	for (; i < n; i += 8) {
+                _mm256_storeu_pd(&x[i +  0], zero);
+                _mm256_storeu_pd(&x[i +  4], zero);
+	}
+#endif
+
+} 
+
+#else
+#include "dscal_microk_haswell-2.c"
+#endif
--- a/kernel/x86_64/dsymv_L.c
+++ b/kernel/x86_64/dsymv_L.c
@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
 #include "dsymv_L_microk_bulldozer-2.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN)
 #include "dsymv_L_microk_haswell-2.c"
+#elif defined (SKYLAKEX)
+#include "dsymv_L_microk_skylakex-2.c"
 #elif defined(SANDYBRIDGE)
 #include "dsymv_L_microk_sandy-2.c"
 #elif defined(NEHALEM)
--- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c
+++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
@ -0,0 +1,161 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#include <immintrin.h>
+
+#define HAVE_KERNEL_4x4 1
+
+static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+
+	__m256d accum_0, accum_1, accum_2, accum_3; 
+	__m256d temp1_0, temp1_1, temp1_2, temp1_3;
+
+	/* the 256 bit wide acculmulator vectors start out as zero */
+	accum_0 = _mm256_setzero_pd();
+	accum_1 = _mm256_setzero_pd();
+	accum_2 = _mm256_setzero_pd();
+	accum_3 = _mm256_setzero_pd();
+
+	temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0]));
+	temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1]));
+	temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2]));
+	temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3]));
+
+#ifdef __AVX512CD__
+	__m512d accum_05, accum_15, accum_25, accum_35;
+	__m512d temp1_05, temp1_15, temp1_25, temp1_35;
+	BLASLONG to2;
+	int delta;
+
+	/* the 512 bit wide accumulator vectors start out as zero */
+	accum_05 = _mm512_setzero_pd();
+	accum_15 = _mm512_setzero_pd();
+	accum_25 = _mm512_setzero_pd();
+	accum_35 = _mm512_setzero_pd();
+
+	temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0]));
+	temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1]));
+	temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2]));
+	temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3]));
+
+	delta = (to - from) & ~7;
+	to2 = from + delta;
+
+
+	for (; from < to2; from += 8) {
+		__m512d _x, _y;
+		__m512d a0, a1, a2, a3;
+
+		_y = _mm512_loadu_pd(&y[from]);
+		_x = _mm512_loadu_pd(&x[from]);
+
+		a0 = _mm512_loadu_pd(&a[0][from]);
+		a1 = _mm512_loadu_pd(&a[1][from]);
+		a2 = _mm512_loadu_pd(&a[2][from]);
+		a3 = _mm512_loadu_pd(&a[3][from]);
+
+		_y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3;
+
+		accum_05 += _x * a0;
+		accum_15 += _x * a1;
+		accum_25 += _x * a2;
+		accum_35 += _x * a3;
+
+		_mm512_storeu_pd(&y[from], _y);
+
+	};
+
+	/*
+	 * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
+	 * below can continue using the intermediate results in its loop
+	 */
+	accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1));
+	accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1));
+	accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1));
+	accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1));
+
+#endif
+
+	for (; from != to; from += 4) {
+		__m256d _x, _y;
+		__m256d a0, a1, a2, a3;
+
+		_y = _mm256_loadu_pd(&y[from]);
+		_x = _mm256_loadu_pd(&x[from]);
+
+		/* load 4 rows of matrix data */
+		a0 = _mm256_loadu_pd(&a[0][from]);
+		a1 = _mm256_loadu_pd(&a[1][from]);
+		a2 = _mm256_loadu_pd(&a[2][from]);
+		a3 = _mm256_loadu_pd(&a[3][from]);
+
+		_y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3;
+
+		accum_0 += _x * a0;
+		accum_1 += _x * a1;
+		accum_2 += _x * a2;
+		accum_3 += _x * a3;
+
+		_mm256_storeu_pd(&y[from], _y);
+
+	};
+
+	/*
+	 * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2
+	 * output array. There is no direct instruction for this in 256 bit space, only in 128 space.
+	 */
+
+	__m128d half_accum0, half_accum1, half_accum2, half_accum3;
+
+
+	/* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */
+	half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
+	half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1));
+	half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1));
+	half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1));
+
+	/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
+	half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
+	half_accum1 = _mm_hadd_pd(half_accum1, half_accum1);
+	half_accum2 = _mm_hadd_pd(half_accum2, half_accum2);
+	half_accum3 = _mm_hadd_pd(half_accum3, half_accum3);
+
+	/* and store the lowest double value from each of these vectors in the temp2 output */
+	temp2[0] += half_accum0[0];
+	temp2[1] += half_accum1[0];
+	temp2[2] += half_accum2[0];
+	temp2[3] += half_accum3[0];
+} 
+#else
+#include "dsymv_L_microk_haswell-2.c"
+#endif
--- a/kernel/x86_64/saxpy.c
+++ b/kernel/x86_64/saxpy.c
@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #if defined(NEHALEM)
 #include "saxpy_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN)
 #include "saxpy_microk_haswell-2.c"
+#elif defined (SKYLAKEX)
+#include "saxpy_microk_skylakex-2.c"
 #elif defined(SANDYBRIDGE)
 #include "saxpy_microk_sandy-2.c"
 #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
--- a/kernel/x86_64/saxpy_microk_skylakex-2.c
+++ b/kernel/x86_64/saxpy_microk_skylakex-2.c
@ -0,0 +1,69 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_16 1
+
+#include <immintrin.h>
+
+static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+	BLASLONG i = 0;
+
+	__m256 __alpha;
+
+	__alpha =  _mm256_broadcastss_ps(_mm_load_ss(alpha));
+
+#ifdef __AVX512CD__
+	BLASLONG n64;
+	__m512 __alpha5;
+	__alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha));
+
+	n64 = n & ~63;
+
+	for (; i < n64; i+= 64) {
+		_mm512_storeu_ps(&y[i +  0], _mm512_loadu_ps(&y[i +  0]) + __alpha5 * _mm512_loadu_ps(&x[i +  0]));
+		_mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16]));
+		_mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32]));
+		_mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48]));
+	}
+
+#endif
+
+	for (; i < n; i+= 32) {
+		_mm256_storeu_ps(&y[i +  0], _mm256_loadu_ps(&y[i +  0]) + __alpha * _mm256_loadu_ps(&x[i +  0]));
+		_mm256_storeu_ps(&y[i +  8], _mm256_loadu_ps(&y[i +  8]) + __alpha * _mm256_loadu_ps(&x[i +  8]));
+		_mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16]));
+		_mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24]));
+	}
+}
+#else
+#include "saxpy_microk_haswell-2.c"
+#endif
+
--- a/kernel/x86_64/sdot.c
+++ b/kernel/x86_64/sdot.c
@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "sdot_microk_steamroller-2.c"
 #elif defined(NEHALEM)
 #include "sdot_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN)
 #include "sdot_microk_haswell-2.c"
+#elif  defined (SKYLAKEX)
+#include "sdot_microk_skylakex-2.c"
 #elif defined(SANDYBRIDGE)
 #include "sdot_microk_sandy-2.c"
 #endif
--- a/kernel/x86_64/sdot_microk_skylakex-2.c
+++ b/kernel/x86_64/sdot_microk_skylakex-2.c
@ -0,0 +1,98 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_16 1
+
+#include <immintrin.h>
+
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+
+{
+	int i = 0;
+	__m256 accum_0, accum_1, accum_2, accum_3;
+
+	accum_0 = _mm256_setzero_ps();
+	accum_1 = _mm256_setzero_ps();
+	accum_2 = _mm256_setzero_ps();
+	accum_3 = _mm256_setzero_ps();
+
+#ifdef __AVX512CD__
+	__m512 accum_05, accum_15, accum_25, accum_35;
+	int n64;
+	n64 = n & (~63);
+
+	accum_05 = _mm512_setzero_ps();
+	accum_15 = _mm512_setzero_ps();
+	accum_25 = _mm512_setzero_ps();
+	accum_35 = _mm512_setzero_ps();
+
+	for (; i < n64; i += 64) {
+		accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]);
+		accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]);
+		accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]);
+		accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]);
+	}
+
+	/*
+	 * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
+	 * below can continue using the intermediate results in its loop
+	 */
+	accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1);
+	accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1);
+	accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1);
+	accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1);
+
+#endif
+	for (; i < n; i += 32) {
+		accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]);
+		accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]);
+		accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]);
+		accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]);
+	}
+
+	/* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
+
+	accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+
+	__m128 half_accum0;
+
+	/* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
+	half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1);
+
+	/* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
+	half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
+	half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
+
+	*dot = half_accum0[0];
+}
+
+#else
+#include "sdot_microk_haswell-2.c"
+#endif
--- a/utest/ctest.h
+++ b/utest/ctest.h
@ -84,7 +84,7 @@ struct ctest {
 #endif

 #if _MSC_VER < 1900
-#define snprintf _snprintf_s
+#define snprintf _snprintf
 #endif

 #ifndef __cplusplus