From f1bb85d378ef4ebcfd4f4c7bbb14b074bfdc945f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 16 Oct 2020 20:52:15 +0200
Subject: [PATCH 01/83] Add AVX flags for clang/aocc as well

---
 Makefile.x86_64 | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 27eb571ee..3a42e19e4 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -78,6 +78,10 @@ GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
 ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
 CCOMMON_OPT += -mavx2
 endif
+else 
+ifeq ($(C_COMPILER), CLANG)
+CCOMMON_OPT += -mavx2
+endif
 endif
 ifeq ($(F_COMPILER), GFORTRAN)
 # AVX2 support was added in 4.7.0

From 5381a18056c1ad6fe171eef275f4b0095e22ee57 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 17 Oct 2020 22:05:36 +0200
Subject: [PATCH 02/83] Update Changelog.txt with the 0.3.11 changes

---
 Changelog.txt | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index cbf0b50f5..bd0e60992 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,76 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.11
+ 17-Oct-2020
+
+ common:
+ 	* API change:
+	  the newly added BFLOAT16 functions were renamed to use the
+	  letter "B" instead of "H" to avoid potential confusion with
+	  the IEEE "half precision float" type, i.e. the 0.3.10
+	  SHGEMM is now SBGEMM and the corresponding build option
+	  was changed from "BUILD_HALF" to "BUILD_BFLOAT16".
+	* Reduced the default BLAS3_MEM_ALLOC_THRESHOLD (used as an upper
+	  limit for placing temporary arrays on the stack) to be compatible
+	  with a stack size of 1mb (as imposed by the JAVA runtime library) 
+	* Added mixed-precision dot function SBDOT and utility functions
+	  sbstobf16, sbdtobf16, sbf16tos and dbf16tod to convert between
+	  single or double precision float arrays and bfloat16 arrays
+	* Fixed prototypes of LAPACK_?ggsvp and LAPACK_?ggsvd functions
+	  in lapack.h
+	* Fixed underflow and rounding errors in LAPACK SLANV2 and DLANV2
+	  (causing miscalculations in e.g. SHSEQR/DHSEQR, LAPACK issue #263)
+	* Fixed workspace calculation in LAPACK ?GELQ (LAPACK issue #415)
+	* Fixed several bugs in the LAPACK testsuite
+	* Improved performance of TRMM and TRSM for certain problem sizes
+	* Fixed infinite recursions and workspace miscalculations in ReLAPACK
+	* CMAKE builds no longer require pkg-config for creating the .pc file
+	* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as 
+	  enabling these options
+	* Fixed detection of gfortran when invoked through an mpi wrapper
+	* Improve thread reinitialization performance with OpenMP after a fork 
+	* Added support for building only the subset of the library required
+	  for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
+	* Optional function name prefixes and suffixes are now correctly
+	  reflected in the generated cblas.h
+	* Added CMAKE build support for the LAPACK and multithreading tests
+
+POWER:
+	* Added optimized support for POWER10
+	* Added support for compiling for POWER8 in 32bit mode
+	* Added support for compilation with LLVM/clang
+	* Added support for compilation with NVIDIA/PGI compilers
+	* Fixed building on big-endian POWER8
+	* Fixed miscompilation of ZDOTC by gcc10
+	* Fixed alignment errors in the POWER8 SAXPY kernel
+	* Improved CPU detection on AIX
+	* Supported building with older compilers on POWER9
+
+x86_64:
+	* Added support for Intel Cooperlake
+	* Added autodetection of AMD Renoir/Matisse/Zen3 cpus
+	* Added autodetection of Intel Comet Lake cpus
+	* Reimplemented ?sum, ?dot and daxpy using universal intrinsics
+	* Reset the fpu state before using the fpu on Windows as a workaround
+	  for a problem introduced in Windows 10 build 19041 (a.k.a. SDK 2004)
+	* Fixed potentially undefined behaviour in the dot and gemv_t kernels
+	* Fixed a potential segmentation fault in DYNAMIC_ARCH builds
+	* Fixed building for ZEN with PGI/NVIDIA and AMD AOCC compilers
+	
+ARMV7:
+	* Fixed cpu detection on BSD-like systems
+
+ARMV8:
+	* Added preliminary support for Apple Vortex cpus
+	* Added support for the Cavium ThunderX3T110 cpu
+	* Fixed cpu detection on BSD-like systems
+	* Fixed compilation in -std=C18 mode
+
+
+IBM Z:
+	* Added support for compiling with the clang compiler
+	* Improved GEMM performance on Z14
+
 ====================================================================
 Version 0.3.10
  14-Jun-2020

From fe9015b619037fdbd04b8ffe4d58ab4f22ea21fd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 17 Oct 2020 22:10:50 +0200
Subject: [PATCH 03/83] Update version for 0.3.11 release

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6cf2ef83..e77aec030 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 10.dev)
+set(OpenBLAS_PATCH_VERSION 11)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions

From b8f689200eccb3802aaa1188a98d3b5578fce295 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 17 Oct 2020 22:11:34 +0200
Subject: [PATCH 04/83] Update version number to 0.3.11

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index 67d183936..acfe568d6 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.10.dev
+VERSION = 0.3.11
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From 26a701f4ad35372c449fd74875fa7f6ff35aeb10 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 17 Oct 2020 22:40:06 +0200
Subject: [PATCH 05/83] Update version string to 0.3.11.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e77aec030..21f0c9571 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 11)
+set(OpenBLAS_PATCH_VERSION 11.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions

From 0ac610270809cb6dee8f5587784ceab8df356495 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 17 Oct 2020 22:40:47 +0200
Subject: [PATCH 06/83] Update version string to 0.3.11.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index acfe568d6..e8f8c2951 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.11
+VERSION = 0.3.11.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From a5c667b55cbb3881f1f1a73fa47b81c50ffa6453 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Sun, 18 Oct 2020 09:40:31 -0500
Subject: [PATCH 07/83] Need a space when redirecting to file

Following two commands have two completely different meanings
perl ./gensymbol objcopy x86_64 _ 0 0  0 0 0 0 "" "64_" 1 0 1 1 1 1 > objcopy.def
perl ./gensymbol objcopy x86_64 _ 0 0  0 0 0 0 "" "64_" 1 0 1 1 1 1> objcopy.def
---
 exports/Makefile | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/exports/Makefile b/exports/Makefile
index 3f1ffba11..eec0593aa 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -120,10 +120,10 @@ dll  : ../$(LIBDLLNAME)
 	-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
 
 $(LIBPREFIX).def : gensymbol
-	perl ./gensymbol win2k    $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+	perl ./gensymbol win2k    $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 libgoto_hpl.def : gensymbol
-	perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+	perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 ifeq ($(OSNAME), Darwin)
 INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
@@ -258,16 +258,16 @@ static : ../$(LIBNAME)
 	rm -f goto.$(SUFFIX)
 
 osx.def : gensymbol ../Makefile.system ../getarch.c
-	perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+	perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 aix.def : gensymbol ../Makefile.system ../getarch.c
-	perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+	perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 objcopy.def : gensymbol ../Makefile.system ../getarch.c
-	perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+	perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 objconv.def : gensymbol ../Makefile.system ../getarch.c
-	perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+	perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 test : linktest.c
 	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.

From 1a0c18512226cab3bdb17e4a31474a590242e183 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 18:54:54 +0200
Subject: [PATCH 08/83] Support cross-compiling for Apple Vortex

---
 getarch.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/getarch.c b/getarch.c
index e2c22d3a0..3f1448305 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1222,6 +1222,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif
 
+#ifdef FORCE_VORTEX
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "VORTEX"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DVORTEX " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME   "vortex"
+#define CORENAME  "VORTEX"
+#endif
+
 #ifdef FORCE_ZARCH_GENERIC
 #define FORCE
 #define ARCHITECTURE    "ZARCH"

From f5902ab0a13e4a49f1794d4f4dbdc5e99908691e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 19:10:58 +0200
Subject: [PATCH 09/83] Support cross-compiling for Apple Vortex

---
 cmake/prebuild.cmake | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index f40304c09..3e38abbf5 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -416,6 +416,29 @@ endif ()
     set(ZGEMM_UNROLL_M 4)
     set(ZGEMM_UNROLL_N 4)
     set(SYMV_P 16)
+elseif ("${TCORE}" STREQUAL "VORTEX")  # Apple Vortex (ARMv8): cache/DTB parameters and GEMM unroll factors
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define ARMV8\n"
+      "#define L1_CODE_SIZE\t32768\n"
+      "#define L1_CODE_LINESIZE\t64\n"
+      "#define L1_CODE_ASSOCIATIVE\t4\n"
+      "#define L1_DATA_SIZE\t32768\n"
+      "#define L1_DATA_LINESIZE\t64\n"
+      "#define L1_DATA_ASSOCIATIVE\t4\n"
+      "#define L2_SIZE\t262144\n"  # 256 KiB, same value as -DL2_SIZE in getarch.c's FORCE_VORTEX
+      "#define L2_LINESIZE\t64\n"
+      "#define L2_ASSOCIATIVE\t8\n"
+      "#define DTB_DEFAULT_ENTRIES\t64\n"
+      "#define DTB_SIZE\t4096\n")
+    set(SGEMM_UNROLL_M 16)  # per-precision GEMM unroll factors (S/D/C/Z), M x N
+    set(SGEMM_UNROLL_N 4)
+    set(DGEMM_UNROLL_M 8)
+    set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 8)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 4)
+    set(ZGEMM_UNROLL_N 4)
+    set(SYMV_P 16)
   elseif ("${TCORE}" STREQUAL "POWER6")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE 32768\n"

From 2e7ee7c716c16b95a215a8309688c910867fa844 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 19:22:05 +0200
Subject: [PATCH 10/83] Fix naming of L2 cache size item reported for Vortex

---
 cpuid_arm64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index ae150ef1b..5f5d7771b 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -424,7 +424,7 @@ void get_cpuconfig(void)
 			sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
 			printf("#define L1_DATA_SIZE	     %d       \n",value);
 			sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
-			printf("#define L2_DATA_SIZE	     %d       \n",value);
+			printf("#define L2_SIZE	     %d       \n",value);
 			break;
 #endif			
 	}

From 7d6c85f9da82f10615daefc9135a2616a4347855 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 19:27:51 +0200
Subject: [PATCH 11/83] Add compiler option -mmma for POWER10

---
 Makefile.power | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.power b/Makefile.power
index e766f8499..59af8ef55 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -10,7 +10,7 @@ USE_OPENMP = 1
 endif
 
 ifeq ($(CORE), POWER10)
-COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx  -fno-fast-math
+COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -mmma  -fno-fast-math
 FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
 endif
 

From d85b24e10320c292c9e3b0f8eff24c032411eeb7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 19:29:45 +0200
Subject: [PATCH 12/83] Clean up STACKSIZE redefinition

---
 kernel/power/dtrmm_kernel_16x4_power8.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S
index 84c65f503..91154ad37 100644
--- a/kernel/power/dtrmm_kernel_16x4_power8.S
+++ b/kernel/power/dtrmm_kernel_16x4_power8.S
@@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef __64BIT__
-#define STACKSIZE 320
 #define STACKSIZE 520
 #define ALPHA_SP   296+200(SP)
 #define FZERO	304+200(SP)

From c1422f3e4624f1733bcc0896a491bf32bc2c1b97 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 19:31:01 +0200
Subject: [PATCH 13/83] Clean up STACKSIZE redefinition

---
 kernel/power/dtrsm_kernel_LT_16x4_power8.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
index 8a423f181..5b349db12 100644
--- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S
+++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
@@ -47,7 +47,6 @@
 #endif
 
 #ifdef __64BIT__
-#define STACKSIZE 320
 #define STACKSIZE 520
 #define ALPHA   296+200(SP)
 #define FZERO	304+200(SP)

From 17e288e18d0f308d0edccf6e53ac34a4029d4e46 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 19:37:04 +0200
Subject: [PATCH 14/83] Clean up STACKSIZE redefinition

---
 kernel/power/ctrmm_kernel_8x4_power8.S | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S
index 822420dfd..35faad19e 100644
--- a/kernel/power/ctrmm_kernel_8x4_power8.S
+++ b/kernel/power/ctrmm_kernel_8x4_power8.S
@@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef __64BIT__
-#define STACKSIZE 400
 #define STACKSIZE 592
 #define ALPHA_R_SP 304+192(SP)
 #define ALPHA_I_SP 312+192(SP)
 #else
-#define STACKSIZE 256
 #define STACKSIZE 452
 #define ALPHA_R_SP 224+196(SP)
 #define ALPHA_I_SP 232+196(SP)

From 97cf10062f328afa1d1a3a4700839a46d7fe6214 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 19:39:18 +0200
Subject: [PATCH 15/83] Clean up STACKSIZE redefinition

---
 kernel/power/strmm_kernel_16x8_power8.S | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S
index 78e539231..a8182b5aa 100644
--- a/kernel/power/strmm_kernel_16x8_power8.S
+++ b/kernel/power/strmm_kernel_16x8_power8.S
@@ -12,7 +12,7 @@ the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
+derived from this software without specific prior written permission.  
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef __64BIT__
-#define STACKSIZE 340
 #define STACKSIZE 540
 #define ALPHA_SP   296+200(SP)
 #define FZERO	304+200(SP)

From f1a4071d8cd6aa94ff0e86a77e6b8f29823b2751 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Oct 2020 19:41:43 +0200
Subject: [PATCH 16/83] Clean up STACKSIZE redefinition

---
 kernel/power/dgemm_kernel_16x4_power8.S | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S
index 651fd53fc..f8ed12ee9 100644
--- a/kernel/power/dgemm_kernel_16x4_power8.S
+++ b/kernel/power/dgemm_kernel_16x4_power8.S
@@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef __64BIT__
-#define STACKSIZE 320
 #define STACKSIZE 512
 #define ALPHA_SP   296+192(SP)
 #define FZERO	304+192(SP)
 #else
-#define STACKSIZE 240
 #define STACKSIZE 440
 #define ALPHA_SP   224+200(SP)
 #define FZERO	232+200(SP)

From 03e781b766eea3551e00b698a3cb345b908e1d8d Mon Sep 17 00:00:00 2001
From: Bart Oldeman <bart.oldeman@calculquebec.ca>
Date: Sun, 18 Oct 2020 19:50:38 +0000
Subject: [PATCH 17/83] sgemm_direct_skylakex: fix 75eeb26 regression.

The
`#if defined(SKYLAKEX) || defined (COOPERLAKE)`
from that commit was before #include "common.h" so caused the
compiled function to be empty, returning garbage results for
qualifying sgemm's on those architectures.

Closes #2914
---
 kernel/x86_64/sgemm_direct_skylakex.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c
index a7cddbb3d..aaadcf151 100644
--- a/kernel/x86_64/sgemm_direct_skylakex.c
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@@ -1,7 +1,8 @@
-#if defined(SKYLAKEX) || defined (COOPERLAKE)
 /* the direct sgemm code written by Arjan van der Ven */
 #include <immintrin.h>
 #include "common.h"
+
+#if defined(SKYLAKEX) || defined (COOPERLAKE)
 /*
  * "Direct sgemm" code. This code operates directly on the inputs and outputs
  * of the sgemm call, avoiding the copies, memory realignments and threading,

From 14b1d339331d48b8aeaaab899770a295917cd720 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Sun, 18 Oct 2020 21:42:32 -0500
Subject: [PATCH 18/83] Fix exporting some lapack and cblas

---
 exports/gensymbol | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index 8482ecb7e..e1f728790 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -50,7 +50,7 @@
     zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
     zgeadd, dzsum);
 
-@cblasobjs = (lsame, xerbla);
+@blasobjs = (lsame, xerbla);
 @halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
 @cblasobjsc = (
     cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
@@ -92,7 +92,7 @@
     cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
 );
 
-@cblasobjs = (  cblas_xerbla );
+@cblasobjs = ( cblas_xerbla );
 
 @halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
 
@@ -3600,6 +3600,7 @@ if ($ARGV[13] == 1) {
 	@lapack2objs = (@lapack2objs, @lapack2objss);
 	@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs,  @lapack_embeded_underscore_objs_s); 
 	@lapackeobjs = (@lapackeobjs, @lapackeobjss);
+	@lapackobjs2 = (@lapackobjs2, @lapackobjs2s);
 }
 if ($ARGV[14] == 1) {
 	@blasobjs = (@blasobjs, @blasobjsd);
@@ -3608,6 +3609,7 @@ if ($ARGV[14] == 1) {
 	@lapack2objs = (@lapack2objs, @lapack2objsd);
 	@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs,  @lapack_embeded_underscore_objs_d);
 	@lapackeobjs = (@lapackeobjs, @lapackeobjsd);
+	@lapackobjs2 = (@lapackobjs2, @lapackobjs2d);
 }
 if ($ARGV[15] == 1) {
 	@blasobjs = (@blasobjs, @blasobjsc);
@@ -3618,6 +3620,7 @@ if ($ARGV[15] == 1) {
 	@lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc);
 	@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs,  @lapack_embeded_underscore_objs_c);
 	@lapackeobjs = (@lapackeobjs, @lapackeobjsc);
+	@lapackobjs2 = (@lapackobjs2,  @lapackobjs2sc, @lapackobjs2c);
 }
 if ($ARGV[16] == 1) {
 	@blasobjs = (@blasobjs, @blasobjsz);
@@ -3628,6 +3631,7 @@ if ($ARGV[16] == 1) {
 	@lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc);
 	@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs,  @lapack_embeded_underscore_objs_z);
 	@lapackeobjs = (@lapackeobjs, @lapackeobjsz);
+	@lapackobjs2 = (@lapackobjs2,  @lapackobjs2dz, @lapackobjs2z);
 }
 if ($ARGV[8] == 1) {
     #ONLY_CBLAS=1

From 7eddaf0d6fb861c11c425fc47b87870585a95829 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 19 Oct 2020 08:11:22 +0200
Subject: [PATCH 19/83] Remove -mmma again (redundant with cpu=power10) and add
 override statements

---
 Makefile.power | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.power b/Makefile.power
index 59af8ef55..6de59c53d 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -10,8 +10,8 @@ USE_OPENMP = 1
 endif
 
 ifeq ($(CORE), POWER10)
-COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -mmma  -fno-fast-math
-FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
+override COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+override FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
 endif
 
 ifeq ($(CORE), POWER9)

From a61c086408650f51e09dbbfcc1b72ecb33272000 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 19 Oct 2020 09:12:12 +0200
Subject: [PATCH 20/83] Fix spurious trailing whitespace in comment

---
 kernel/power/strmm_kernel_16x8_power8.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S
index a8182b5aa..1f9912c49 100644
--- a/kernel/power/strmm_kernel_16x8_power8.S
+++ b/kernel/power/strmm_kernel_16x8_power8.S
@@ -12,7 +12,7 @@ the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.  
+derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

From fe2a922adaac599077651119c2230987a44a7fb6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 19 Oct 2020 17:43:53 +0200
Subject: [PATCH 21/83] Add POWER10 compiler options to CCOMMON_OPT rather than
 COMMON_OPT

---
 Makefile.power | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.power b/Makefile.power
index 6de59c53d..c7e972290 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -10,8 +10,8 @@ USE_OPENMP = 1
 endif
 
 ifeq ($(CORE), POWER10)
-override COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
-override FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
 endif
 
 ifeq ($(CORE), POWER9)

From 4ad33c46b0c4b13606653d9a461f06a22f4fd404 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 19 Oct 2020 20:37:52 +0200
Subject: [PATCH 22/83] Add back symbols that got dropped when splitting by
 type

---
 exports/gensymbol | 103 +++++++++++++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 33 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index e1f728790..d5ec45fad 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -51,7 +51,7 @@
     zgeadd, dzsum);
 
 @blasobjs = (lsame, xerbla);
-@halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
+@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
 @cblasobjsc = (
     cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
     cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
@@ -72,7 +72,7 @@
     );
     
 @cblasobjss = (
-    cblas_sasum, cblas_saxpy,
+    cblas_sasum, cblas_saxpy, cblas_saxpby,
     cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm,
     cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg,
     cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr,
@@ -92,9 +92,9 @@
     cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
 );
 
-@cblasobjs = ( cblas_xerbla );
+@cblasobjs = (  cblas_xerbla );
 
-@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
+@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
 
 @exblasobjs = (
     qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@@ -415,7 +415,7 @@ zpotri,
     cgeqrt, cgeqrt2, cgeqrt3, cgemqrt,
     ctpqrt, ctpqrt2, ctpmqrt, ctprfb,
 );
-@lapack2objszc = (
+@lapackobjs2zc = (
     # ZCLASRC -- Double-single mixed precision complex routines called from
     # single, single-extra and double precision complex LAPACK
     # routines (i.e. from CLASRC, CXLASRC, ZLASRC).
@@ -425,7 +425,7 @@ zpotri,
     cpotrs,
 );
 
-@lapack2objsd = (
+@lapackobjs2d = (
     # DLASRC  -- Double precision real LAPACK routines
     # already provided by @lapackobjs:
     #     dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri,
@@ -568,7 +568,7 @@ zpotri,
 );
     # functions added for lapack-3.6.0
 
-@lapack2objsc = ( @lapack2objsc,
+@lapackobjs2c = ( @lapackobjs2c,
     cgejsv,
     cgesvdx,
     cgesvj,
@@ -604,7 +604,7 @@ zpotri,
     csyr2,
     cunm22,
 );
-@lapackobjs2d = (@lapack2objsd,
+@lapackobjs2d = (@lapackobjs2d,
     dbdsvdx,
     dgesvdx,
     dgetrf2,
@@ -637,7 +637,7 @@ zpotri,
     dpotrf2,
     dsecnd,
     );
-    @lapack2objss = (@lapack2objss,
+    @lapackobjs2s = (@lapackobjs2s,
     sbdsvdx,
     second,
     sgesvdx,
@@ -670,7 +670,7 @@ zpotri,
     sorm22,
     spotrf2,
     );
-    @lapack2objsz = (@lapack2objsz,
+    @lapackobjs2z = (@lapackobjs2z,
     zgejsv,
     zgesvdx,
     zgesvj,
@@ -707,7 +707,7 @@ zpotri,
     zunm22,
 );
     # functions added for lapack-3.7.0
-@lapack2objss = (@lapack2objss,
+@lapackobjs2s = (@lapackobjs2s,
     slarfy,
     strevc3,
     sgelqt,
@@ -726,7 +726,7 @@ zpotri,
     stplqt2,
     stpmlqt,
     );
-    @lapack2objsd = (@lapack2objsd,
+    @lapackobjs2d = (@lapackobjs2d,
     dlarfy,
     dsyconvf,
     dtrevc3,
@@ -746,7 +746,7 @@ zpotri,
     dtplqt2,
     dtpmlqt,
     );
-    @lapack2objsc = (@lapack2objsc,
+    @lapackobjs2c = (@lapackobjs2c,
     clarfy,
     csyconvf,
     ctrevc3,
@@ -766,7 +766,7 @@ zpotri,
     ctplqt2,
     ctpmlqt,
     );
-    @lapack2objsz = (@lapack2objsz,
+    @lapackobjs2z = (@lapackobjs2z,
     zlarfy,
     zsyconvf,
     ztrevc3,
@@ -786,7 +786,7 @@ zpotri,
     zlamswlq,
     zgemlq,
     );
-    @lapack2objs = (@lapack2objs,
+    @lapackobjs2 = (@lapackobjs2,
     sladiv1,
     dladiv1,
     iparam2stage,
@@ -796,21 +796,21 @@ zpotri,
     ilaenv2stage,
     );
     # functions added for lapack-3.9.0
-@lapack2objsc = (@lapack2objsc,
+@lapackobjs2c = (@lapackobjs2c,
     cgesvdq,
     cungtsqr,
     dcombssq,
     );
-@lapack2objsd = (@lapack2objsd,
+@lapackobjs2d = (@lapackobjs2d,
     dgesvdq,
     dorgtsqr,
     );
-@lapack2objss = (@lapack2objss,
+@lapackobjs2s = (@lapackobjs2s,
     scombssq,
     sgesvdq,
     sorgtsqr,
     );
-@lapack2objsz = (@lapack2objsz,
+@lapackobjs2z = (@lapackobjs2z,
     zgesvdq,
     zungtsqr
 );
@@ -835,10 +835,29 @@ zpotri,
   dlatzm,  dtzrqf);
   
 @lapack_deprecated_objss = ( 
+  sgelsx,
   sgegs,
-  sgegv,                   
+  sgegv,
+  sgeqpf,
+  sggsvd,
+  sggsvp,
+  slahrd,
+  slatzm,
+  stzrqf
   );
-                           
+
+@lapack_deprecated_objsz = ( 
+  zgegs,
+  zgegv,
+  zgelsx,
+  zgeqpf,
+  zggsvd,
+  zggsvp,
+  zlahrd,
+  zlatzm,
+  ztzrqf
+  );
+
 @lapacke_deprecated_objsc = (
     LAPACKE_cggsvp,
     LAPACKE_cggsvp_work,
@@ -3590,48 +3609,66 @@ use File::Basename;
 my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");
 
 if ($ARGV[12] == 1) {
-	@blasobjs = (@blasobjs, @halfblasobjs);
-	@cblasobjs = (@cblasobjs, @halfcblasobjs);
+	@blasobjs = (@blasobjs, @bfblasobjs);
+	@cblasobjs = (@cblasobjs, @bfcblasobjs);
 }
 if ($ARGV[13] == 1) {
 	@blasobjs = (@blasobjs, @blasobjss);
 	@cblasobjs = (@cblasobjs, @cblasobjss);
 	@lapackobjs = (@lapackobjs, @lapackobjss);
-	@lapack2objs = (@lapack2objs, @lapack2objss);
+	@lapackobjs2 = (@lapackobjs2, @lapackobjs2s);
+	@lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
+	@lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
+	@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objss);
+	@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objss);
 	@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs,  @lapack_embeded_underscore_objs_s); 
 	@lapackeobjs = (@lapackeobjs, @lapackeobjss);
-	@lapackobjs2 = (@lapackobjs2, @lapackobjs2s);
 }
 if ($ARGV[14] == 1) {
 	@blasobjs = (@blasobjs, @blasobjsd);
 	@cblasobjs = (@cblasobjs, @cblasobjsd);
 	@lapackobjs = (@lapackobjs, @lapackobjsd);
-	@lapack2objs = (@lapack2objs, @lapack2objsd);
+	if ($ARGV[13] == 0) { 
+		@lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
+	}
+	@lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz);
+	@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsd);
+	@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsd);
 	@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs,  @lapack_embeded_underscore_objs_d);
 	@lapackeobjs = (@lapackeobjs, @lapackeobjsd);
-	@lapackobjs2 = (@lapackobjs2, @lapackobjs2d);
 }
 if ($ARGV[15] == 1) {
 	@blasobjs = (@blasobjs, @blasobjsc);
 	@cblasobjs = (@cblasobjs, @cblasobjsc);
 	@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc);
-	@cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsc);
+	@cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc);
 	@lapackobjs = (@lapackobjs, @lapackobjsc);
-	@lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc);
+	@lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc);
+	if ($ARGV[13] == 0) { 
+		@lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
+	}
+	@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc);
+	@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsc);
 	@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs,  @lapack_embeded_underscore_objs_c);
 	@lapackeobjs = (@lapackeobjs, @lapackeobjsc);
-	@lapackobjs2 = (@lapackobjs2,  @lapackobjs2sc, @lapackobjs2c);
 }
 if ($ARGV[16] == 1) {
 	@blasobjs = (@blasobjs, @blasobjsz);
 	@cblasobjs = (@cblasobjs, @cblasobjsz);
 	@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz);
-	@cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsz);
+	@cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz);
 	@lapackobjs = (@lapackobjs, @lapackobjsz);
-	@lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc);
+	@lapackobjs2 = (@lapackobjs2, @lapackobjs2z);
+	if ($ARGV[15] == 0) { 
+		@lapackobjs2 = (@lapackobjs2, @lapackobjs2zc);
+	}
+	if ($ARGV[14] == 0) { 
+		@lapackobjs2 = (@lapackobjs2, @lapackobjs2dz);
+	}
+	@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz);
+	@lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsz);
 	@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs,  @lapack_embeded_underscore_objs_z);
 	@lapackeobjs = (@lapackeobjs, @lapackeobjsz);
-	@lapackobjs2 = (@lapackobjs2,  @lapackobjs2dz, @lapackobjs2z);
 }
 if ($ARGV[8] == 1) {
     #ONLY_CBLAS=1

From ff65952e46b84a4a1a969d1cff7e90c3fb15ae43 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Oct 2020 00:55:41 +0200
Subject: [PATCH 23/83] Move HAVE_P10_SUPPORT to the build system

to be able to include a binutils version check
---
 driver/others/dynamic_power.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index ca1d42408..85fc5b3ba 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -6,10 +6,10 @@ extern gotoblas_t gotoblas_POWER8;
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 extern gotoblas_t gotoblas_POWER9;
 #endif
-#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
-     || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
-#define HAVE_P10_SUPPORT 1
-#endif
+//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
+//     || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
+//#define HAVE_P10_SUPPORT 1
+//#endif
 #ifdef HAVE_P10_SUPPORT
 extern gotoblas_t gotoblas_POWER10;
 #endif

From bb8c3f68611fadff9a99b7cfdebaf250ccbaa129 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Oct 2020 01:04:20 +0200
Subject: [PATCH 24/83] Add ld/binutils version check for POWER10 support

---
 Makefile.system | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 461f7370b..7f0c26796 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -641,6 +641,7 @@ DYNAMIC_CORE += POWER8
 ifneq ($(C_COMPILER), GCC)
 DYNAMIC_CORE += POWER9
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 endif
 ifeq ($(C_COMPILER), GCC)
 ifeq ($(GCCVERSIONGT5), 1)
@@ -648,11 +649,14 @@ DYNAMIC_CORE += POWER9
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
 endif
-ifeq ($(GCCVERSIONGTEQ11), 1)
+LDVERSIONGTEQ35 := $(shell expr ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-" >= 35)
+ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 else ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq ($(GCCMINORVERSIONGTEQ2), 1)
+ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 endif
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)

From eddc65c7b751d280d5cc16f4121eeb923920c8c1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Oct 2020 01:09:49 +0200
Subject: [PATCH 25/83] Add POWER10 support flag (unconditionally for now)

---
 cmake/arch.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 99e685d04..5457bfb07 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -49,6 +49,7 @@ if (DYNAMIC_ARCH)
   
   if (POWER)
 	  set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
   endif ()
   
   if (X86)

From b073d759d056cef826e3906ef00a42068df46091 Mon Sep 17 00:00:00 2001
From: Bart Oldeman <bart.oldeman@calculquebec.ca>
Date: Tue, 20 Oct 2020 02:16:47 +0000
Subject: [PATCH 26/83] x86_64: clobber all xmm registers after vzeroupper

As observed with GCC 10 using -march=native -ftree-vectorize on
Knights Landing, the compiler is now smart enough to find clobbers
inside non-inlined static functions.

In particular, sgemv counted on a kernel to preserve the whole
%ymm2 register (since it was not in the clobber list), but the top
part was destroyed by vzeroupper. This caused many tests to fail.

This patch makes sure all xmm (and ymm/zmm by extension) registers
are listed as clobbered to avoid this happening, as most kernels
already did correctly in fact.
---
 kernel/x86_64/caxpy_microk_bulldozer-2.c    |  5 +++--
 kernel/x86_64/caxpy_microk_haswell-2.c      |  2 +-
 kernel/x86_64/caxpy_microk_sandy-2.c        |  2 +-
 kernel/x86_64/caxpy_microk_steamroller-2.c  |  5 +++--
 kernel/x86_64/daxpy_microk_haswell-2.c      |  5 +++--
 kernel/x86_64/ddot_microk_haswell-2.c       |  5 +++--
 kernel/x86_64/ddot_microk_piledriver-2.c    |  2 ++
 kernel/x86_64/ddot_microk_sandy-2.c         |  5 +++--
 kernel/x86_64/ddot_microk_steamroller-2.c   |  1 +
 kernel/x86_64/dgemv_n_microk_haswell-4.c    | 14 ++++++--------
 kernel/x86_64/dgemv_n_microk_piledriver-4.c |  6 ++++--
 kernel/x86_64/dgemv_t_microk_haswell-4.c    |  2 ++
 kernel/x86_64/saxpy_microk_haswell-2.c      |  3 ++-
 kernel/x86_64/saxpy_microk_piledriver-2.c   |  6 ++++--
 kernel/x86_64/sdot_microk_haswell-2.c       |  5 +++--
 kernel/x86_64/sdot_microk_sandy-2.c         |  5 +++--
 kernel/x86_64/sgemv_n_microk_haswell-4.c    | 14 ++++++--------
 kernel/x86_64/sgemv_t_microk_haswell-4.c    |  2 ++
 kernel/x86_64/zaxpy_microk_bulldozer-2.c    |  5 +++--
 kernel/x86_64/zaxpy_microk_haswell-2.c      |  2 +-
 kernel/x86_64/zaxpy_microk_sandy-2.c        |  6 ++++--
 kernel/x86_64/zaxpy_microk_steamroller-2.c  |  5 +++--
 22 files changed, 63 insertions(+), 44 deletions(-)

diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c
index ca2209340..a32558dc9 100644
--- a/kernel/x86_64/caxpy_microk_bulldozer-2.c
+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c
@@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
 
diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c
index b605ea34c..129ce7a49 100644
--- a/kernel/x86_64/caxpy_microk_haswell-2.c
+++ b/kernel/x86_64/caxpy_microk_haswell-2.c
@@ -120,7 +120,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15", 
diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c
index 72d37afed..564dfbd0f 100644
--- a/kernel/x86_64/caxpy_microk_sandy-2.c
+++ b/kernel/x86_64/caxpy_microk_sandy-2.c
@@ -104,7 +104,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15", 
diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c
index 7ca7af070..cc5c5de76 100644
--- a/kernel/x86_64/caxpy_microk_steamroller-2.c
+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c
@@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
 
diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c
index f3682e6d7..ecc0ecbd3 100644
--- a/kernel/x86_64/daxpy_microk_haswell-2.c
+++ b/kernel/x86_64/daxpy_microk_haswell-2.c
@@ -67,8 +67,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (y),      // 3
           "r" (alpha)   // 4
 	: "cc", 
-	  "%xmm0", 
-	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c
index dbb5487f7..faac72870 100644
--- a/kernel/x86_64/ddot_microk_haswell-2.c
+++ b/kernel/x86_64/ddot_microk_haswell-2.c
@@ -84,8 +84,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
           "r" (y),      // 3
           "r" (dot)     // 4
 	: "cc", 
-	  "%xmm4", "%xmm5", 
-	  "%xmm6", "%xmm7", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c
index cc4bcd90a..0320a2e36 100644
--- a/kernel/x86_64/ddot_microk_piledriver-2.c
+++ b/kernel/x86_64/ddot_microk_piledriver-2.c
@@ -91,6 +91,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 	: "cc", 
 	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
@@ -155,6 +156,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 	: "cc", 
 	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c
index 84493ec27..35ba86a7d 100644
--- a/kernel/x86_64/ddot_microk_sandy-2.c
+++ b/kernel/x86_64/ddot_microk_sandy-2.c
@@ -89,8 +89,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
           "r" (y),      // 3
           "r" (dot)     // 4
 	: "cc", 
-	  "%xmm4", "%xmm5", 
-	  "%xmm6", "%xmm7", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c
index 27d5244ce..94c012f0d 100644
--- a/kernel/x86_64/ddot_microk_steamroller-2.c
+++ b/kernel/x86_64/ddot_microk_steamroller-2.c
@@ -88,6 +88,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 	: "cc", 
 	  "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c
index da0fa2fff..c20c0a030 100644
--- a/kernel/x86_64/dgemv_n_microk_haswell-4.c
+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c
@@ -105,9 +105,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
           "r" (alpha)   // 8
 	: "cc", 
 	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
-	  "%xmm4", "%xmm5", 
-	  "%xmm6", "%xmm7", 
-	  "%xmm8", "%xmm9", 
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
@@ -182,11 +181,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
           "r" (ap[1]),  // 5
           "r" (alpha)   // 6
 	: "cc", 
-	  "%xmm0", "%xmm1", 
-	  "%xmm4", "%xmm5", 
-	  "%xmm6", 
-	  "%xmm8", 
-	  "%xmm12", "%xmm13",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
 }
diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c
index 466931b82..57fa426ba 100644
--- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c
+++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c
@@ -140,7 +140,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 	  "%xmm2", "%xmm3", 
 	  "%xmm4", "%xmm5", 
 	  "%xmm6", "%xmm7", 
-	  "%xmm8", "%xmm9", 
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
@@ -235,9 +235,11 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
           "r" (ap[3]),  // 7
           "r" (alpha)   // 8
 	: "cc", 
+	  "%xmm0", "%xmm1",
+	  "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", 
 	  "%xmm6", "%xmm7", 
-	  "%xmm8", "%xmm9", 
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c
index 958fd3e0a..b398307d3 100644
--- a/kernel/x86_64/dgemv_t_microk_haswell-4.c
+++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c
@@ -117,7 +117,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
           "r" (ap[2]),  // 6
           "r" (ap[3])   // 7
 	: "cc", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c
index 7099ba4c6..8cc697f05 100644
--- a/kernel/x86_64/saxpy_microk_haswell-2.c
+++ b/kernel/x86_64/saxpy_microk_haswell-2.c
@@ -67,7 +67,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (y),      // 3
           "r" (alpha)   // 4
 	: "cc", 
-	  "%xmm0", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c
index 5feea7f24..ebbcc0045 100644
--- a/kernel/x86_64/saxpy_microk_piledriver-2.c
+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c
@@ -86,7 +86,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (y),      // 3
           "r" (alpha)   // 4
 	: "cc", 
-	  "%xmm0", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
@@ -147,7 +148,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (y),      // 3
           "r" (alpha)   // 4
 	: "cc", 
-	  "%xmm0", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c
index 91dc928d3..322f4b28c 100644
--- a/kernel/x86_64/sdot_microk_haswell-2.c
+++ b/kernel/x86_64/sdot_microk_haswell-2.c
@@ -87,8 +87,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
           "r" (y),      // 3
           "r" (dot)     // 4
 	: "cc", 
-	  "%xmm4", "%xmm5", 
-	  "%xmm6", "%xmm7", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c
index ae25d5a50..ce09b06cf 100644
--- a/kernel/x86_64/sdot_microk_sandy-2.c
+++ b/kernel/x86_64/sdot_microk_sandy-2.c
@@ -90,8 +90,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
           "r" (y),      // 3
           "r" (dot)     // 4
 	: "cc", 
-	  "%xmm4", "%xmm5", 
-	  "%xmm6", "%xmm7", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
index 93e1e26e8..556dcfde5 100644
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
@@ -164,11 +164,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
           "r" (ap[3]),  // 8
           "r" (alpha)   // 9
 	: "cc", 
-	  "%xmm0", "%xmm1", 
-	  "%xmm2", "%xmm3", 
-	  "%xmm4", "%xmm5", 
-	  "%xmm6", "%xmm7", 
-	  "%xmm8", "%xmm9", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
@@ -286,9 +284,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
           "r" (ap[3]),  // 7
           "r" (alpha)   // 8
 	: "cc", 
-	  "%xmm4", "%xmm5", 
-	  "%xmm6", "%xmm7", 
-	  "%xmm8", "%xmm9", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c
index 8c370b4c0..fcabc0def 100644
--- a/kernel/x86_64/sgemv_t_microk_haswell-4.c
+++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c
@@ -138,7 +138,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
           "r" (ap[2]),  // 6
           "r" (ap[3])   // 7
 	: "cc", 
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c
index 15d367971..ccb26134f 100644
--- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c
+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c
@@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
 
diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c
index 89d23daf3..8f299ea2d 100644
--- a/kernel/x86_64/zaxpy_microk_haswell-2.c
+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c
@@ -120,7 +120,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15", 
diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c
index 17b8b24f7..5246c72e8 100644
--- a/kernel/x86_64/zaxpy_microk_sandy-2.c
+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c
@@ -108,9 +108,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
 	return;
@@ -185,9 +186,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
 
diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c
index 907b1ae00..88e3a680b 100644
--- a/kernel/x86_64/zaxpy_microk_steamroller-2.c
+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c
@@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
           "r" (alpha),  // 4
           "r" (mvec)    // 5
 	: "cc", 
-	  "%xmm0", "%xmm1",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
 	  "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);
 

From 1a0f57c8f0bd4e2a1eb8ae7a996a09468d7f3067 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Oct 2020 08:37:53 +0200
Subject: [PATCH 27/83] Fix missing backquotes

---
 Makefile.system | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index 7f0c26796..30d8f4ccf 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -649,7 +649,7 @@ DYNAMIC_CORE += POWER9
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
 endif
-LDVERSIONGTEQ35 := $(shell expr ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-" >= 35)
+LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35)
 ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
 CCOMMON_OPT += -DHAVE_P10_SUPPORT

From 00813363bedef30ebd49373c8ccb4c8de6711ed2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Oct 2020 23:56:30 +0200
Subject: [PATCH 28/83] Enable -mavx2 for flang as well

---
 Makefile.x86_64 | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 3a42e19e4..58264262e 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -90,6 +90,10 @@ GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
 ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
 FCOMMON_OPT += -mavx2
 endif
+else
+ifeq ($(F_COMPILER), FLANG)
+FCOMMON_OPT += -mavx2
+endif
 endif
 endif
 endif

From 4a1d00f5892906aa705a54d49fb45ab1a2ff15f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=B0smail=20D=C3=B6nmez?= <ismail@i10z.com>
Date: Wed, 21 Oct 2020 08:43:39 +0200
Subject: [PATCH 29/83] Fix build with -Werror=return-type
 dgemm_tcopy_16_skylakex.c CNAME function should return an int, add a return 0
 similar to other files.

---
 kernel/x86_64/dgemm_tcopy_16_skylakex.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/x86_64/dgemm_tcopy_16_skylakex.c b/kernel/x86_64/dgemm_tcopy_16_skylakex.c
index a1da60f8f..ff2c48617 100644
--- a/kernel/x86_64/dgemm_tcopy_16_skylakex.c
+++ b/kernel/x86_64/dgemm_tcopy_16_skylakex.c
@@ -126,4 +126,5 @@ int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_di
     }
     src1 += src_inc;
   }
+  return 0;
 }

From ad745c0bae5a17e6460556b39dd991519c31ca4f Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Wed, 21 Oct 2020 09:53:45 -0500
Subject: [PATCH 30/83] Optimize scopy/ccopy for POWER10

This patch makes use of the new POWER10 vector pair instructions for
loads and stores. It also reorganizes all variants of the copy
functions to make use of the same kernel.
---
 kernel/power/KERNEL.POWER10                   |   4 +-
 kernel/power/ccopy_power10.c                  | 132 +++++++++++++++++
 ...microk_power10.c => copy_microk_power10.c} |  19 ++-
 kernel/power/dcopy_power10.c                  |   8 +-
 kernel/power/scopy_power10.c                  | 123 ++++++++++++++++
 kernel/power/zcopy_microk_power10.c           | 134 ------------------
 kernel/power/zcopy_power10.c                  |   8 +-
 7 files changed, 280 insertions(+), 148 deletions(-)
 create mode 100644 kernel/power/ccopy_power10.c
 rename kernel/power/{dcopy_microk_power10.c => copy_microk_power10.c} (91%)
 create mode 100644 kernel/power/scopy_power10.c
 delete mode 100644 kernel/power/zcopy_microk_power10.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 031d96581..86df7e3a2 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -150,9 +150,9 @@ CAXPYKERNEL  = caxpy.c
 endif
 ZAXPYKERNEL  = zaxpy_power10.c
 #
-SCOPYKERNEL  = scopy.c
+SCOPYKERNEL  = scopy_power10.c
 DCOPYKERNEL  = dcopy_power10.c
-CCOPYKERNEL  = ccopy.c
+CCOPYKERNEL  = ccopy_power10.c
 ZCOPYKERNEL  = zcopy_power10.c
 #
 SDOTKERNEL   =  sdot.c
diff --git a/kernel/power/ccopy_power10.c b/kernel/power/ccopy_power10.c
new file mode 100644
index 000000000..a5877cd12
--- /dev/null
+++ b/kernel/power/ccopy_power10.c
@@ -0,0 +1,132 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "copy_microk_power10.c"
+#endif
+
+#ifndef HAVE_KERNEL
+
+static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+	/* Plain C fallback, compiled only when copy_microk_power10.c did not define HAVE_KERNEL. */
+	BLASLONG i=0;
+	FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+	FLOAT *x1=x;
+	FLOAT *y1=y;
+
+	while ( i<n )
+	{
+		/* Load 8 consecutive FLOATs from x before storing any of them. */
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+		/* Store them to the same offsets in y. */
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1 += 8;
+		y1 += 8;
+		/* 8 FLOATs moved but i advances by 4: n counts 2-FLOAT elements. No tail loop, so n must be a multiple of 4. */
+		i+=4;
+	}
+	return;
+
+}
+
+
+#endif
+
+
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	/* Copies n elements of two FLOATs each from x to y; returns 0 in every case. */
+	if ( n <= 0     )  return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1 ))
+	{
+		/* Unit stride: hand the largest multiple of 64 elements to copy_kernel. */
+		BLASLONG n1 = n & -64;
+		if ( n1 > 0 )
+		{
+			copy_kernel(n1, x, y);
+			i=n1;
+			ix=n1*2;
+			iy=n1*2;
+		}
+		/* Copy the remaining n - n1 elements one FLOAT pair at a time. */
+		while(i < n)
+		{
+			y[iy] = x[ix] ;
+			y[iy+1] = x[ix+1] ;
+			ix+=2;
+			iy+=2;
+			i++ ;
+
+		}
+
+
+	}
+	else
+	{
+		/* General stride: increments count elements, so double them to index FLOATs. */
+		BLASLONG inc_x2 = 2 * inc_x;
+		BLASLONG inc_y2 = 2 * inc_y;
+
+		while(i < n)
+		{
+			y[iy] = x[ix] ;
+			y[iy+1] = x[ix+1] ;
+			ix += inc_x2 ;
+			iy += inc_y2 ;
+			i++ ;
+
+		}
+
+	}
+	return(0);
+	
+
+}
+
+
diff --git a/kernel/power/dcopy_microk_power10.c b/kernel/power/copy_microk_power10.c
similarity index 91%
rename from kernel/power/dcopy_microk_power10.c
rename to kernel/power/copy_microk_power10.c
index 8940e0db9..c90dc3785 100644
--- a/kernel/power/dcopy_microk_power10.c
+++ b/kernel/power/copy_microk_power10.c
@@ -25,9 +25,9 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#define HAVE_KERNEL_64 1
+#define HAVE_KERNEL 1
 
-static void dcopy_kernel_64 (long n, double *x, double *y)
+static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
 {
   __asm__
     (
@@ -49,8 +49,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y)
        "lxvp		60, 448(%2)	\n\t"
        "lxvp		62, 480(%2)	\n\t"
        "addi		%2, %2, 512	\n\t"
-
+#if !defined(COMPLEX) && !defined(DOUBLE)
+       "addic.		%1, %1, -128	\n\t"
+#elif defined(COMPLEX) && defined(DOUBLE)
+       "addic.		%1, %1, -32	\n\t"
+#else
        "addic.		%1, %1, -64	\n\t"
+#endif
        "ble		two%=		\n\t"
 
        ".align	5		\n"
@@ -94,7 +99,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y)
        "addi		%3, %3, 512	\n\t"
        "addi		%2, %2, 512	\n\t"
 
+#if !defined(COMPLEX) && !defined(DOUBLE)
+       "addic.		%1, %1, -128	\n\t"
+#elif defined(COMPLEX) && defined(DOUBLE)
+       "addic.		%1, %1, -32	\n\t"
+#else
        "addic.		%1, %1, -64	\n\t"
+#endif
        "bgt		one%=		\n"
 
      "two%=:				\n\t"
@@ -121,7 +132,7 @@ static void dcopy_kernel_64 (long n, double *x, double *y)
        "=m" (*y),
        "+r" (n),	// 1
        "+b" (x),	// 2
-       "+b" (y)		// 3
+       "+b" (y) 	// 3
      :
        "m" (*x)
      :
diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c
index 32530d570..cd10b7136 100644
--- a/kernel/power/dcopy_power10.c
+++ b/kernel/power/dcopy_power10.c
@@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if defined(__VEC__) || defined(__ALTIVEC__)
-#include "dcopy_microk_power10.c"
+#include "copy_microk_power10.c"
 #endif
 
-#ifndef HAVE_KERNEL_64
+#ifndef HAVE_KERNEL
 
-static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
+static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
 {
 
 	BLASLONG i=0;
@@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 		BLASLONG n1 = n & -64;
 		if ( n1 > 0 )
 		{
-			dcopy_kernel_64(n1, x, y);
+			copy_kernel(n1, x, y);
 			i=n1;
 		}
 
diff --git a/kernel/power/scopy_power10.c b/kernel/power/scopy_power10.c
new file mode 100644
index 000000000..298a8998a
--- /dev/null
+++ b/kernel/power/scopy_power10.c
@@ -0,0 +1,123 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "copy_microk_power10.c"
+#endif
+
+#ifndef HAVE_KERNEL
+
+static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
+{
+	/* Plain C fallback, compiled only when copy_microk_power10.c did not define HAVE_KERNEL. */
+	BLASLONG i=0;
+	FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+	FLOAT *x1=x;
+	FLOAT *y1=y;
+
+	while ( i<n )
+	{
+		/* Load 8 consecutive FLOATs from x before storing any of them. */
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+		/* Store them to the same offsets in y. */
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1 += 8;
+		y1 += 8;
+		/* n counts single FLOATs here. No tail loop, so n must be a multiple of 8. */
+		i+=8;
+	}
+	return;
+
+}
+
+
+#endif
+
+
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	/* Copies n FLOATs from x to y; returns 0 in every case. */
+	if ( n <= 0     )  return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1 ))
+	{
+		/* Unit stride: hand the largest multiple of 128 FLOATs to copy_kernel. */
+		BLASLONG n1 = n & -128;
+		if ( n1 > 0 )
+		{
+			copy_kernel (n1, x, y);
+			i=n1;
+		}
+		/* Copy the remaining n - n1 FLOATs one at a time. */
+		while(i < n)
+		{
+			y[i] = x[i] ;
+			i++ ;
+
+		}
+
+
+	}
+	else
+	{
+		/* General stride: ix and iy step through x and y by inc_x and inc_y. */
+		while(i < n)
+		{
+			y[iy] = x[ix] ;
+			ix += inc_x ;
+			iy += inc_y ;
+			i++ ;
+
+		}
+
+	}
+	return(0);
+	
+
+}
+
+
diff --git a/kernel/power/zcopy_microk_power10.c b/kernel/power/zcopy_microk_power10.c
deleted file mode 100644
index f2f2119a3..000000000
--- a/kernel/power/zcopy_microk_power10.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/***************************************************************************
-Copyright (c) 2020, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_32 1
-
-static void zcopy_kernel_32 (long n, double *x, double *y)
-{
-  __asm__
-    (
-       "lxvp		32, 0(%2)	\n\t"
-       "lxvp		34, 32(%2)	\n\t"
-       "lxvp		36, 64(%2)	\n\t"
-       "lxvp		38, 96(%2)	\n\t"
-       "lxvp		40, 128(%2)	\n\t"
-       "lxvp		42, 160(%2)	\n\t"
-       "lxvp		44, 192(%2)	\n\t"
-       "lxvp		46, 224(%2)	\n\t"
-
-       "lxvp		48, 256(%2)	\n\t"
-       "lxvp		50, 288(%2)	\n\t"
-       "lxvp		52, 320(%2)	\n\t"
-       "lxvp		54, 352(%2)	\n\t"
-       "lxvp		56, 384(%2)	\n\t"
-       "lxvp		58, 416(%2)	\n\t"
-       "lxvp		60, 448(%2)	\n\t"
-       "lxvp		62, 480(%2)	\n\t"
-       "addi		%2, %2, 512	\n\t"
-
-       "addic.		%1, %1, -32	\n\t"
-       "ble		two%=		\n\t"
-
-       ".align	5		\n"
-     "one%=:				\n\t"
-
-       "stxvp		32, 0(%3)	\n\t"
-       "lxvp		32, 0(%2)	\n\t"
-       "stxvp		34, 32(%3)	\n\t"
-       "lxvp		34, 32(%2)	\n\t"
-       "stxvp		36, 64(%3)	\n\t"
-       "lxvp		36, 64(%2)	\n\t"
-       "stxvp		38, 96(%3)	\n\t"
-       "lxvp		38, 96(%2)	\n\t"
-
-       "stxvp		40, 128(%3)	\n\t"
-       "lxvp		40, 128(%2)	\n\t"
-       "stxvp		42, 160(%3)	\n\t"
-       "lxvp		42, 160(%2)	\n\t"
-       "stxvp		44, 192(%3)	\n\t"
-       "lxvp		44, 192(%2)	\n\t"
-       "stxvp		46, 224(%3)	\n\t"
-       "lxvp		46, 224(%2)	\n\t"
-
-       "stxvp		48, 256(%3)	\n\t"
-       "lxvp		48, 256(%2)	\n\t"
-       "stxvp		50, 288(%3)	\n\t"
-       "lxvp		50, 288(%2)	\n\t"
-       "stxvp		52, 320(%3)	\n\t"
-       "lxvp		52, 320(%2)	\n\t"
-       "stxvp		54, 352(%3)	\n\t"
-       "lxvp		54, 352(%2)	\n\t"
-       "stxvp		56, 384(%3)	\n\t"
-       "lxvp		56, 384(%2)	\n\t"
-       "stxvp		58, 416(%3)	\n\t"
-       "lxvp		58, 416(%2)	\n\t"
-       "stxvp		60, 448(%3)	\n\t"
-       "lxvp		60, 448(%2)	\n\t"
-       "stxvp		62, 480(%3)	\n\t"
-       "lxvp		62, 480(%2)	\n\t"
-
-       "addi		%3, %3, 512	\n\t"
-       "addi		%2, %2, 512	\n\t"
-
-       "addic.		%1, %1, -32	\n\t"
-       "bgt		one%=		\n"
-
-     "two%=:				\n\t"
-
-       "stxvp		32, 0(%3)	\n\t"
-       "stxvp		34, 32(%3)	\n\t"
-       "stxvp		36, 64(%3)	\n\t"
-       "stxvp		38, 96(%3)	\n\t"
-       "stxvp		40, 128(%3)	\n\t"
-       "stxvp		42, 160(%3)	\n\t"
-       "stxvp		44, 192(%3)	\n\t"
-       "stxvp		46, 224(%3)	\n\t"
-       "stxvp		48, 256(%3)	\n\t"
-       "stxvp		50, 288(%3)	\n\t"
-       "stxvp		52, 320(%3)	\n\t"
-       "stxvp		54, 352(%3)	\n\t"
-       "stxvp		56, 384(%3)	\n\t"
-       "stxvp		58, 416(%3)	\n\t"
-       "stxvp		60, 448(%3)	\n\t"
-       "stxvp		62, 480(%3)	\n\t"
-
-     "#n=%1 x=%4=%2 y=%0=%3"
-     :
-       "=m" (*y),
-       "+r" (n),	// 1
-       "+b" (x),	// 2
-       "+b" (y)		// 3
-     :
-       "m" (*x)
-     :
-       "cr0",
-       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
-       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
-       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
-       "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
-     );
-}
diff --git a/kernel/power/zcopy_power10.c b/kernel/power/zcopy_power10.c
index 99d463b02..6b4e7a7d4 100644
--- a/kernel/power/zcopy_power10.c
+++ b/kernel/power/zcopy_power10.c
@@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if defined(__VEC__) || defined(__ALTIVEC__)
-#include "zcopy_microk_power10.c"
+#include "copy_microk_power10.c"
 #endif
 
-#ifndef HAVE_KERNEL_32
+#ifndef HAVE_KERNEL
 
-static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
 {
 
 	BLASLONG i=0;
@@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 		BLASLONG n1 = n & -32;
 		if ( n1 > 0 )
 		{
-			zcopy_kernel_32(n1, x, y);
+			copy_kernel(n1, x, y);
 			i=n1;
 			ix=n1*2;
 			iy=n1*2;

From 47696b43e964ff2e0d7e869fd9dd2397a47566ea Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Wed, 21 Oct 2020 16:42:37 -0400
Subject: [PATCH 31/83] actually check that version is greater than 4.7

---
 Makefile.x86_64 | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 58264262e..117347c01 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -74,8 +74,10 @@ ifndef NO_AVX2
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
 GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
+GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
 CCOMMON_OPT += -mavx2
 endif
 else 
@@ -86,8 +88,10 @@ endif
 ifeq ($(F_COMPILER), GFORTRAN)
 # AVX2 support was added in 4.7.0
 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
+GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
-ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCVERSIONMINORGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
 FCOMMON_OPT += -mavx2
 endif
 else

From f95031204ee88a8976bc377e524abd09a8b5cac3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Oct 2020 16:19:26 +0200
Subject: [PATCH 32/83] Fix macro used in argument conversion (LAPACK PR 458)

---
 lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c
index f58a5c4e9..4928b1bc0 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c
@@ -71,7 +71,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp,
         goto exit_level_0;
     }
     liwork = iwork_query;
-    lcwork = LAPACK_C2INT(cwork_query);
+    lcwork = LAPACK_Z2INT(cwork_query);
     lrwork = (lapack_int)rwork_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );

From 2e48d560bad5400f9a33a643f504a6eb707621f9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Oct 2020 16:23:29 +0200
Subject: [PATCH 33/83] Fix compiler version check

---
 kernel/Makefile | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index 43318d475..e52781c6d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,20 +22,25 @@ ifeq ($(C_COMPILER), CLANG)
  override CFLAGS += -fno-integrated-as
 endif
 endif
+
 AVX2OPT = 
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
-  GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-  GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-  ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
+GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
+GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
+GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
    AVX2OPT = -mavx2
   endif
 endif
 ifeq ($(C_COMPILER), CLANG)
 # Any clang posing as gcc 4.2 should be new enough (3.4 or later)
   GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
+  GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
   GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
-  ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
+  GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+  ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
    AVX2OPT = -mavx2
   endif
 endif

From ee90f30384875b82f84ea8f5c9284d64af247054 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Oct 2020 18:47:07 +0200
Subject: [PATCH 34/83] Increase BUFFERSIZE for POWER8-10 and use same value
 for POWER6

to fix overflow warning for PWR8 ZGEMM and PWR9 C/ZGEMM and avoid size mismatches in DYNAMIC_ARCH
---
 common_power.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common_power.h b/common_power.h
index e0685f760..0f1001cb6 100644
--- a/common_power.h
+++ b/common_power.h
@@ -844,8 +844,8 @@ Lmcount$lazy_ptr:
 #define BUFFER_SIZE     (  2 << 20)
 #elif defined(PPC440FP2)
 #define BUFFER_SIZE     ( 16 << 20)
-#elif defined(POWER8) || defined(POWER9) || defined(POWER10)
-#define BUFFER_SIZE     ( 64 << 20)
+#elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
+#define BUFFER_SIZE     ( 32 << 22)
 #else
 #define BUFFER_SIZE     ( 16 << 20)
 #endif

From 34c3c407efaaf9770f75f0b9bf8846d91ea3283b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Oct 2020 22:14:26 +0200
Subject: [PATCH 35/83] label always_inline function as inline to silence a gcc
 warning

---
 kernel/power/zgemv_t_4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c
index 4ed27d96b..956d75ffc 100644
--- a/kernel/power/zgemv_t_4.c
+++ b/kernel/power/zgemv_t_4.c
@@ -513,7 +513,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT al
 
 #endif
 
-static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
+static __attribute__((always_inline)) inline void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
     BLASLONG i;
     for (i = 0; i < n; i++) {
         *dest = *src;

From 1d4c96fa0c3506d7bfee45463b17ee2dbb3db3d5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 23 Oct 2020 00:12:06 +0200
Subject: [PATCH 36/83] Increase BUFFERSIZE further

---
 common_power.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common_power.h b/common_power.h
index 0f1001cb6..a61e4e28a 100644
--- a/common_power.h
+++ b/common_power.h
@@ -845,7 +845,7 @@ Lmcount$lazy_ptr:
 #elif defined(PPC440FP2)
 #define BUFFER_SIZE     ( 16 << 20)
 #elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
-#define BUFFER_SIZE     ( 32 << 22)
+#define BUFFER_SIZE     ( 64 << 22)
 #else
 #define BUFFER_SIZE     ( 16 << 20)
 #endif

From b23cb0523174bbd8bff06ca37be947140c9bfd9f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 23 Oct 2020 00:18:29 +0200
Subject: [PATCH 37/83] Fix twisted spelling that broke the gfortran version
 test again

---
 Makefile.x86_64 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 117347c01..a849f0b01 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -90,7 +90,7 @@ ifeq ($(F_COMPILER), GFORTRAN)
 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
 GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
-GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCVERSIONMINORGTEQ7)
+GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
 ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
 FCOMMON_OPT += -mavx2
 endif

From 1917a4e7b842ef046de2401bf634682039883768 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Thu, 22 Oct 2020 22:00:00 -0400
Subject: [PATCH 38/83] reuse variables defined in Makefile.system

---
 Makefile.x86_64 |  7 +------
 kernel/Makefile | 15 ++-------------
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/Makefile.x86_64 b/Makefile.x86_64
index a849f0b01..49a9a0a23 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -47,8 +47,6 @@ ifndef DYNAMIC_ARCH
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
 # cooperlake support was added in 10.1
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
-GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
 ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
 CCOMMON_OPT += -march=cooperlake
 FCOMMON_OPT += -march=cooperlake
@@ -73,10 +71,7 @@ ifndef DYNAMIC_ARCH
 ifndef NO_AVX2
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
 ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
 CCOMMON_OPT += -mavx2
 endif
diff --git a/kernel/Makefile b/kernel/Makefile
index e52781c6d..e811ed43d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,11 +12,6 @@ ifdef HAVE_SSSE3
 CFLAGS += -mssse3
 endif
 
-ifeq ($(C_COMPILER), GCC)
-GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
-endif
-
 ifeq ($(ARCH), power)
 ifeq ($(C_COMPILER), CLANG)
  override CFLAGS += -fno-integrated-as
@@ -26,20 +21,14 @@ endif
 AVX2OPT = 
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
 ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
    AVX2OPT = -mavx2
   endif
 endif
 ifeq ($(C_COMPILER), CLANG)
 # Any clang posing as gcc 4.2 should be new enough (3.4 or later)
-  GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-  GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
-  GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
-  GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+  GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2)
   ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
    AVX2OPT = -mavx2
   endif

From dd6ebdfdab65e5235da4887c943f7639639d19af Mon Sep 17 00:00:00 2001
From: Qiyu8 <fangchunlin@huawei.com>
Date: Fri, 23 Oct 2020 10:32:03 +0800
Subject: [PATCH 39/83] Refactor the performance measurement system

---
 benchmark/amax.c     | 170 +++++++++++++--------------------------
 benchmark/amin.c     | 166 +++++++++++++-------------------------
 benchmark/asum.c     | 184 ++++++++++++++-----------------------------
 benchmark/axpby.c    |  86 +-------------------
 benchmark/axpy.c     |  81 +------------------
 benchmark/bench.h    | 103 ++++++++++++++++++++++++
 benchmark/cholesky.c |  50 +-----------
 benchmark/copy.c     |  86 +-------------------
 benchmark/dot.c      |  84 +-------------------
 benchmark/geev.c     |  80 +------------------
 benchmark/gemm.c     |  80 +------------------
 benchmark/gemm3m.c   |  83 +------------------
 12 files changed, 304 insertions(+), 949 deletions(-)
 create mode 100644 benchmark/bench.h

diff --git a/benchmark/amax.c b/benchmark/amax.c
index 19ae95c8b..29310dd71 100644
--- a/benchmark/amax.c
+++ b/benchmark/amax.c
@@ -25,125 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef AMAX
 
 #ifdef COMPLEX
 #ifdef DOUBLE
-#define AMAX   BLASFUNC(dzamax)
+#define AMAX BLASFUNC(dzamax)
 #else
-#define AMAX   BLASFUNC(scamax)
+#define AMAX BLASFUNC(scamax)
 #endif
 #else
 #ifdef DOUBLE
-#define AMAX   BLASFUNC(damax)
+#define AMAX BLASFUNC(damax)
 #else
-#define AMAX   BLASFUNC(samax)
+#define AMAX BLASFUNC(samax)
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
-int main(int argc, char *argv[]){
+int main(int argc, char *argv[])
+{
 
   FLOAT *x;
   blasint m, i;
-  blasint inc_x=1;
+  blasint inc_x = 1;
   int loops = 1;
   int l;
   char *p;
 
+  int from = 1;
+  int to = 200;
+  int step = 1;
 
-  int from =   1;
-  int to   = 200;
-  int step =   1;
+  double time1, timeg;
 
-  struct timeval start, stop;
-  double time1,timeg;
+  argc--;
+  argv++;
 
-  argc--;argv++;
+  if (argc > 0)
+  {
+    from = atol(*argv);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    to = MAX(atol(*argv), from);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    step = atol(*argv);
+    argc--;
+    argv++;
+  }
 
-  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
-  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
-  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}
+  if ((p = getenv("OPENBLAS_LOOPS")))
+    loops = atoi(p);
+  if ((p = getenv("OPENBLAS_INCX")))
+    inc_x = atoi(p);
 
-  if ((p = getenv("OPENBLAS_LOOPS")))  loops = atoi(p);
-  if ((p = getenv("OPENBLAS_INCX")))   inc_x = atoi(p);
+  fprintf(stderr, "From : %3d  To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
 
-  fprintf(stderr, "From : %3d  To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
-
-  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
-    fprintf(stderr,"Out of Memory!!\n");exit(1);
+  if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
+  {
+    fprintf(stderr, "Out of Memory!!\n");
+    exit(1);
   }
 
 #ifdef __linux
@@ -152,37 +100,31 @@ int main(int argc, char *argv[]){
 
   fprintf(stderr, "   SIZE       Flops\n");
 
-  for(m = from; m <= to; m += step)
+  for (m = from; m <= to; m += step)
   {
 
-   timeg=0;
+    timeg = 0;
+    fprintf(stderr, " %6d : ", (int)m);
 
-   fprintf(stderr, " %6d : ", (int)m);
+    for (l = 0; l < loops; l++)
+    {
 
+      for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
+      {
+        x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
+      }
 
-   for (l=0; l<loops; l++)
-   {
-
-   	for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
-			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
-   	}
-
-    	gettimeofday( &start, (struct timezone *)0);
-    	AMAX (&m, x, &inc_x);
-    	gettimeofday( &stop, (struct timezone *)0);
-
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-	timeg += time1;
-
+      begin();
+      AMAX(&m, x, &inc_x);
+      end();
+      timeg += getsec();
     }
 
     timeg /= loops;
 
     fprintf(stderr,
-	    " %10.2f MFlops %10.6f sec\n",
-	    COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
-
+            " %10.2f MFlops %10.6f sec\n",
+            COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
   }
 
   return 0;
diff --git a/benchmark/amin.c b/benchmark/amin.c
index d0cadbd3b..4bcff9bba 100644
--- a/benchmark/amin.c
+++ b/benchmark/amin.c
@@ -25,124 +25,74 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef AMIN
 
 #ifdef COMPLEX
 #ifdef DOUBLE
-#define AMIN   BLASFUNC(dzamin)
+#define AMIN BLASFUNC(dzamin)
 #else
-#define AMIN   BLASFUNC(scamin)
+#define AMIN BLASFUNC(scamin)
 #endif
 #else
 #ifdef DOUBLE
-#define AMIN   BLASFUNC(damin)
+#define AMIN BLASFUNC(damin)
 #else
-#define AMIN   BLASFUNC(samin)
+#define AMIN BLASFUNC(samin)
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
-int main(int argc, char *argv[]){
+int main(int argc, char *argv[])
+{
 
   FLOAT *x;
   blasint m, i;
-  blasint inc_x=1;
+  blasint inc_x = 1;
   int loops = 1;
   int l;
   char *p;
 
-  int from =   1;
-  int to   = 200;
-  int step =   1;
+  int from = 1;
+  int to = 200;
+  int step = 1;
 
   struct timeval start, stop;
-  double time1,timeg;
+  double time1, timeg;
 
-  argc--;argv++;
+  argc--;
+  argv++;
 
-  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
-  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
-  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}
+  if (argc > 0)
+  {
+    from = atol(*argv);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    to = MAX(atol(*argv), from);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    step = atol(*argv);
+    argc--;
+    argv++;
+  }
 
-  if ((p = getenv("OPENBLAS_LOOPS")))  loops = atoi(p);
-  if ((p = getenv("OPENBLAS_INCX")))   inc_x = atoi(p);
+  if ((p = getenv("OPENBLAS_LOOPS")))
+    loops = atoi(p);
+  if ((p = getenv("OPENBLAS_INCX")))
+    inc_x = atoi(p);
 
-  fprintf(stderr, "From : %3d  To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
+  fprintf(stderr, "From : %3d  To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
 
-  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
-    fprintf(stderr,"Out of Memory!!\n");exit(1);
+  if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
+  {
+    fprintf(stderr, "Out of Memory!!\n");
+    exit(1);
   }
 
 #ifdef __linux
@@ -151,39 +101,35 @@ int main(int argc, char *argv[]){
 
   fprintf(stderr, "   SIZE       Flops\n");
 
-  for(m = from; m <= to; m += step)
+  for (m = from; m <= to; m += step)
   {
 
-   timeg=0;
+    timeg = 0;
 
-   fprintf(stderr, " %6d : ", (int)m);
+    fprintf(stderr, " %6d : ", (int)m);
 
+    for (l = 0; l < loops; l++)
+    {
 
-   for (l=0; l<loops; l++)
-   {
+      for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
+      {
+        x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
+      }
 
-   	for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
-			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
-   	}
+      begin();
 
-    	gettimeofday( &start, (struct timezone *)0);
+      AMIN(&m, x, &inc_x);
 
-    	AMIN (&m, x, &inc_x);
-
-    	gettimeofday( &stop, (struct timezone *)0);
-
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-	timeg += time1;
+      end();
 
+      timeg += getsec();
     }
 
     timeg /= loops;
 
     fprintf(stderr,
-	    " %10.2f MFlops %10.6f sec\n",
-	    COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
-
+            " %10.2f MFlops %10.6f sec\n",
+            COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
   }
 
   return 0;
diff --git a/benchmark/asum.c b/benchmark/asum.c
index bcccd9089..098ddc8ee 100644
--- a/benchmark/asum.c
+++ b/benchmark/asum.c
@@ -25,132 +25,74 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef ASUM
 
 #ifdef COMPLEX
 #ifdef DOUBLE
-#define ASUM   BLASFUNC(dzasum)
+#define ASUM BLASFUNC(dzasum)
 #else
-#define ASUM   BLASFUNC(scasum)
+#define ASUM BLASFUNC(scasum)
 #endif
 #else
 #ifdef DOUBLE
-#define ASUM   BLASFUNC(dasum)
+#define ASUM BLASFUNC(dasum)
 #else
-#define ASUM   BLASFUNC(sasum)
+#define ASUM BLASFUNC(sasum)
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
-int main(int argc, char *argv[]){
+int main(int argc, char *argv[])
+{
 
   FLOAT *x;
   FLOAT result;
   blasint m, i;
-  blasint inc_x=1;
+  blasint inc_x = 1;
   int loops = 1;
   int l;
   char *p;
 
-  int from =   1;
-  int to   = 200;
-  int step =   1;
-
-#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
-  struct timeval start, stop;
-  double time1,timeg;
-#else
-  struct timespec start = { 0, 0 }, stop = { 0, 0 };
+  int from = 1;
+  int to = 200;
+  int step = 1;
   double time1, timeg;
-#endif
 
-  argc--;argv++;
+  argc--;
+  argv++;
 
-  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
-  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
-  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}
-
-  if ((p = getenv("OPENBLAS_LOOPS")))  loops = atoi(p);
-  if ((p = getenv("OPENBLAS_INCX")))   inc_x = atoi(p);
-
-  fprintf(stderr, "From : %3d  To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
-
-  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
-    fprintf(stderr,"Out of Memory!!\n");exit(1);
+  if (argc > 0)
+  {
+    from = atol(*argv);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    to = MAX(atol(*argv), from);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    step = atol(*argv);
+    argc--;
+    argv++;
   }
 
+  if ((p = getenv("OPENBLAS_LOOPS")))
+    loops = atoi(p);
+  if ((p = getenv("OPENBLAS_INCX")))
+    inc_x = atoi(p);
+
+  fprintf(stderr, "From : %3d  To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
+
+  if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
+  {
+    fprintf(stderr, "Out of Memory!!\n");
+    exit(1);
+  }
 
 #ifdef __linux
   srandom(getpid());
@@ -158,45 +100,33 @@ int main(int argc, char *argv[]){
 
   fprintf(stderr, "   SIZE       Flops\n");
 
-  for(m = from; m <= to; m += step)
+  for (m = from; m <= to; m += step)
   {
 
-   timeg=0;
+    timeg = 0;
 
-   fprintf(stderr, " %6d : ", (int)m);
+    fprintf(stderr, " %6d : ", (int)m);
 
-   for (l=0; l<loops; l++)
-   {
-
-   	for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
-			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
-   	}
-#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
-    	gettimeofday( &start, (struct timezone *)0);
-#else
-        clock_gettime(CLOCK_REALTIME, &start);
-#endif
-    	result = ASUM (&m, x, &inc_x);
-#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
-    	clock_gettime(CLOCK_REALTIME, &stop);
-   	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-#else
-  	gettimeofday( &stop, (struct timezone *)0);
-        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
-#endif
-
-	timeg += time1;
+    for (l = 0; l < loops; l++)
+    {
 
+      for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
+      {
+        x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
+      }
+      begin();
+      result = ASUM(&m, x, &inc_x);
+      end();
+      timeg += getsec();
     }
-if (loops >1)
-    timeg /= loops;
+    if (loops > 1)
+      timeg /= loops;
 
 #ifdef COMPLEX
     fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
 #else
     fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
 #endif
-
   }
 
   return 0;
diff --git a/benchmark/axpby.c b/benchmark/axpby.c
index 793ee7e40..d02d9a889 100644
--- a/benchmark/axpby.c
+++ b/benchmark/axpby.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef AXPBY
 
@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-             (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-             SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -129,7 +58,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -176,16 +104,10 @@ int main(int argc, char *argv[]){
 
     for (l=0; l<loops; l++)
     {
-        gettimeofday( &start, (struct timezone *)0);
-
+        begin();
         AXPBY (&m, alpha, x, &inc_x, beta, y, &inc_y );
-
-        gettimeofday( &stop, (struct timezone *)0);
-
-        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-        timeg += time1;
-
+        end();
+        timeg += getsec();
     }
 
     timeg /= loops;
diff --git a/benchmark/axpy.c b/benchmark/axpy.c
index 760703c1d..5a7dead33 100644
--- a/benchmark/axpy.c
+++ b/benchmark/axpy.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef AXPY
 
@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -127,8 +56,6 @@ int main(int argc, char *argv[]){
   int from =   1;
   int to   = 200;
   int step =   1;
-
-  struct timespec start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -175,13 +102,13 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	clock_gettime( CLOCK_REALTIME, &start);
+    	begin();
 
     	AXPY (&m, alpha, x, &inc_x, y, &inc_y );
 
-    	clock_gettime( CLOCK_REALTIME, &stop);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/bench.h b/benchmark/bench.h
new file mode 100644
index 000000000..9055beaa7
--- /dev/null
+++ b/benchmark/bench.h
@@ -0,0 +1,122 @@
+#ifndef OPENBLAS_BENCHMARK_BENCH_H
+#define OPENBLAS_BENCHMARK_BENCH_H
+
+/*
+ * Shared scaffolding for the benchmark programs: the common includes, a
+ * gettimeofday() replacement for native Windows builds, and a minimal
+ * wall-clock stopwatch (begin() / end() / getsec()).
+ *
+ * The stopwatch state and helpers have internal linkage, so the generic
+ * names used here are not exported from the benchmark executables.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+/* gettimeofday() stand-in built on GetSystemTimeAsFileTime(); tz is ignored. */
+int gettimeofday(struct timeval *tv, void *tz){
+
+  FILETIME ft;
+  unsigned __int64 tmpres = 0;
+
+  if (NULL != tv)
+    {
+      GetSystemTimeAsFileTime(&ft);
+
+      tmpres |= ft.dwHighDateTime;
+      tmpres <<= 32;
+      tmpres |= ft.dwLowDateTime;
+
+      /*converting file time to unix epoch*/
+      tmpres /= 10;  /*convert into microseconds*/
+      tmpres -= DELTA_EPOCH_IN_MICROSECS;
+      tv->tv_sec = (long)(tmpres / 1000000UL);
+      tv->tv_usec = (long)(tmpres % 1000000UL);
+    }
+
+  return 0;
+}
+
+#endif
+
+/* Disabled (note the trailing "&& 0"): SHM_HUGETLB-backed replacement for malloc. */
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+  int shmid;
+  void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+  if ((shmid =shmget(IPC_PRIVATE,
+		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+    printf( "Memory allocation failed(shmget).\n");
+    exit(1);
+  }
+
+  address = shmat(shmid, NULL, SHM_RND);
+
+  if ((BLASLONG)address == -1){
+    printf( "Memory allocation failed(shmat).\n");
+    exit(1);
+  }
+
+  shmctl(shmid, IPC_RMID, 0);
+
+  return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+/* Timestamps recorded by begin() and end(). */
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+static struct timeval start, stop;
+#else
+static struct timespec start = { 0, 0 }, stop = { 0, 0 };
+#endif
+
+/* Seconds elapsed between the most recent begin() and end() calls. */
+static double getsec(void)
+{
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+    return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+#else
+    return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
+#endif
+}
+
+/* Record the start timestamp. */
+static void begin(void) {
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+    gettimeofday( &start, (struct timezone *)0);
+#else
+    clock_gettime(CLOCK_REALTIME, &start);
+#endif
+}
+
+/* Record the stop timestamp. */
+static void end(void) {
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+    gettimeofday( &stop, (struct timezone *)0);
+#else
+    clock_gettime(CLOCK_REALTIME, &stop);
+#endif
+}
+
+#endif /* OPENBLAS_BENCHMARK_BENCH_H */
diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c
index 5908b6085..65b20d039 100644
--- a/benchmark/cholesky.c
+++ b/benchmark/cholesky.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 double fabs(double);
 
@@ -71,41 +66,6 @@ double fabs(double);
 #endif
 #endif
 
-
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-
 static __inline double getmflops(int ratio, int m, double secs){
 
   double mm = (double)m;
@@ -145,7 +105,6 @@ int main(int argc, char *argv[]){
 
   FLOAT maxerr;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -220,20 +179,20 @@ int main(int argc, char *argv[]){
 
       SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m);
 
-      gettimeofday( &start, (struct timezone *)0);
+      begin();
 
       POTRF(uplo[uplos], &m, b, &m, &info);
 
-      gettimeofday( &stop, (struct timezone *)0);
+      end();
 
       if (info != 0) {
 	fprintf(stderr, "Info = %d\n", info);
 	exit(1);
       }
 
-     time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+     time1 = getsec();
 
       maxerr = 0.;
 
       if (!(uplos & 1)) {
 	for (j = 0; j < m; j++) {
diff --git a/benchmark/copy.c b/benchmark/copy.c
index eb5148fff..c5e447521 100644
--- a/benchmark/copy.c
+++ b/benchmark/copy.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef COPY
 
@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -128,11 +57,9 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1 = 0.0, timeg = 0.0;
   long nanos = 0;
   time_t seconds = 0;
-  struct timespec time_start = { 0, 0 }, time_end = { 0, 0 };
 
   argc--;argv++;
 
@@ -176,15 +103,10 @@ int main(int argc, char *argv[]){
 
    for (l=0; l<loops; l++)
    {
-       clock_gettime(CLOCK_REALTIME, &time_start);
+       begin();
        COPY (&m, x, &inc_x, y, &inc_y );
-       clock_gettime(CLOCK_REALTIME, &time_end);
-
-       nanos = time_end.tv_nsec - time_start.tv_nsec;
-       seconds = time_end.tv_sec - time_start.tv_sec;
-
-       time1 = seconds + nanos / 1.e9;
-       timeg += time1;
+       end();
+       timeg += getsec();
    }
 
       timeg /= loops;
diff --git a/benchmark/dot.c b/benchmark/dot.c
index aae3c04b0..86f4e3828 100644
--- a/benchmark/dot.c
+++ b/benchmark/dot.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef DOT
 
-
 #ifdef DOUBLE
 #define DOT   BLASFUNC(ddot)
 #else
 #define DOT   BLASFUNC(sdot)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -169,15 +96,12 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	result = DOT (&m, x, &inc_x, y, &inc_y );
 
-    	gettimeofday( &stop, (struct timezone *)0);
-
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-	timeg += time1;
+    	end();
+	    timeg += getsec();
 
     }
 
diff --git a/benchmark/geev.c b/benchmark/geev.c
index 4fd2c8d6f..6e22cdfb6 100644
--- a/benchmark/geev.c
+++ b/benchmark/geev.c
@@ -36,13 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef GEEV
 
@@ -74,71 +68,6 @@ extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a,
                 FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info );
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
@@ -154,7 +83,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -223,7 +151,7 @@ int main(int argc, char *argv[]){
   for(m = from; m <= to; m += step){
 
     fprintf(stderr, " %6d : ", (int)m);
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     lwork = -1;
 #ifndef COMPLEX
@@ -239,14 +167,14 @@ int main(int argc, char *argv[]){
     GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info);
 #endif
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
     if (info) {
       fprintf(stderr, "failed to compute eigenvalues .. %d\n", info);
       exit(1);
     }
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops : %10.2f Sec : %d\n",
diff --git a/benchmark/gemm.c b/benchmark/gemm.c
index 8cd14bbed..35f5096f3 100644
--- a/benchmark/gemm.c
+++ b/benchmark/gemm.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef GEMM
 
@@ -55,71 +49,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   IFLOAT *a, *b;
@@ -139,7 +68,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1, timeg;
 
   argc--;argv++;
@@ -228,14 +156,14 @@ int main(int argc, char *argv[]){
     ldc = m;
 
     fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k);
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     for (j=0; j<loops; j++) {
       GEMM (&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc);
     }
 
-    gettimeofday( &stop, (struct timezone *)0);
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    end();
+    time1 = getsec();
 
     timeg = time1/loops;
     fprintf(stderr,
diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c
index 98c13e1be..76b8176b2 100644
--- a/benchmark/gemm3m.c
+++ b/benchmark/gemm3m.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef GEMM
 
@@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b, *c;
@@ -187,16 +116,12 @@ int main(int argc, char *argv[]){
       		}
     	}
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
 
-    	gettimeofday( &stop, (struct timezone *)0);
-
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-	timeg += time1;
-
+    	end();
+	    timeg += getsec();
     }
 
     timeg /= loops;

From 4c45cd629438f1626d96b32db0f2da8b29651080 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 23 Oct 2020 15:31:25 +0200
Subject: [PATCH 40/83] fix missing split of sladiv1/dladiv1/ilaenv2stage by
 build type

---
 exports/gensymbol | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index d5ec45fad..824546052 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -786,13 +786,13 @@ zpotri,
     zlamswlq,
     zgemlq,
     );
-    @lapackobjs2 = (@lapackobjs2,
-    sladiv1,
-    dladiv1,
+    @lapackobjs2s = (@lapackobjs2s,
+    sladiv1);
+    @lapackobjs2d = (@lapackobjs2d,
+    dladiv1);
+    @lapackobjs = (@lapackobjs,
     iparam2stage,
-    
     # functions added for lapack-3.8.0
-
     ilaenv2stage,
     );
     # functions added for lapack-3.9.0

From 0d140e61acf3c174ba0dd41dd50462498a887a90 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 23 Oct 2020 15:53:40 +0200
Subject: [PATCH 41/83] Fix wrong grouping of dcombssq

---
 exports/gensymbol | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index 824546052..22e470da5 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -798,10 +798,10 @@ zpotri,
     # functions added for lapack-3.9.0
 @lapackobjs2c = (@lapackobjs2c,
     cgesvdq,
-    cungtsqr,
-    dcombssq,
+    cungtsqr
     );
 @lapackobjs2d = (@lapackobjs2d,
+    dcombssq,
     dgesvdq,
     dorgtsqr,
     );

From 365f28787c2048845b26c47127f547d92dd42f99 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 23 Oct 2020 23:32:06 +0200
Subject: [PATCH 42/83] Comment out  BUILD_SINGLE etc. and add a short
 explanation

---
 Makefile.rule | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Makefile.rule b/Makefile.rule
index e8f8c2951..85a3a451b 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -295,10 +295,13 @@ COMMON_PROF = -pg
 
 
 
-# the below is not yet configurable, use cmake if you need to build only select types
-BUILD_SINGLE = 1
-BUILD_DOUBLE = 1
-BUILD_COMPLEX = 1
-BUILD_COMPLEX16 = 1
+# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
+# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
+# the functions for complex numbers, uncomment the desired type(s) below
+# BUILD_SINGLE = 1
+# BUILD_DOUBLE = 1
+# BUILD_COMPLEX = 1
+# BUILD_COMPLEX16 = 1
+#
 #  End of user configuration
 #

From 26f658c9d262caa6a1fdecd5ae936b39c302a463 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 24 Oct 2020 12:14:45 +0200
Subject: [PATCH 43/83] Update version to 0.3.12 for release

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21f0c9571..53c1709a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 11.dev)
+set(OpenBLAS_PATCH_VERSION 12)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions

From e1c18e4eebcdc9193dd52e0248ae1f99cbfd8369 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 24 Oct 2020 12:15:33 +0200
Subject: [PATCH 44/83] Update version to 0.3.12 for release

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index 85a3a451b..a4d11dc7c 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.11.dev
+VERSION = 0.3.12
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From 89db73569b5731d6a5f1a39b3941abc8e26ab374 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 24 Oct 2020 12:50:04 +0200
Subject: [PATCH 45/83] Update Changelog with 0.3.12 changes

---
 Changelog.txt | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/Changelog.txt b/Changelog.txt
index bd0e60992..1e843e38e 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,9 +1,36 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.12
+ 24-Oct-2020
+
+common:
+	* Fixed missibg LAPACK functions (inadvertently dropped during
+	  the build system restructuring)
+	* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
+
+POWER:
+	* Added optimized SCOPY/CCOPY kernels for POWER10
+	* Increased and unified the default size of the GEMM BUFFER
+	* Fixed building for POWER1ß in DYNAMIC_ARCH mode
+	* POWER10 compatibility test now checks binutils version as well
+	* Cleaned up compiler warnings
+
+x86_64:
+	* corrected compiler version checks for AVX2 compatibility
+	* added compiler option -mavx2 for building with flang
+	* fixed direct SGEMM pathway for small matrix sizes (broken by
+	  the code refactoring in 0.3.11)
+	* fixed unhandled partial register clobbers in several kernels
+	  for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
+
+ARMV8:
+	* improved Apple Vortex support to include cross-compiling
+
 ====================================================================
 Version 0.3.11
  17-Oct-2020
 
- common:
+common:
  	* API change:
 	  the newly added BFLOAT16 functions were renamed to use the
 	  letter "B" instead of "H" to avoid potential confusion with
@@ -28,7 +55,7 @@ Version 0.3.11
 	* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as 
 	  enabling these options
 	* Fixed detection of gfortran when invoked through an mpi wrapper
-	* Improve thread reinitialization performance with OpenMP xafter a fork 
+	* Improve thread reinitialization performance with OpenMP after a fork 
 	* Added support for building only the subset of the library required
 	  for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
 	* Optional function name prefixes and suffixes are now correctly
@@ -66,7 +93,6 @@ ARMV8:
 	* Fixed cpu detection on BSD-like systems
 	* Fixed compilation in -std=C18 mode
 
-
 IBM Z:
 	* Added support for compiling with the clang compiler
 	* Improved GEMM performance on Z14

From c5f280a7f0e875d83833d895b2b8b0e341efabf4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 24 Oct 2020 13:03:28 +0200
Subject: [PATCH 46/83] Fix typos

---
 Changelog.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Changelog.txt b/Changelog.txt
index 1e843e38e..edd3563ec 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -4,14 +4,14 @@ Version 0.3.12
  24-Oct-2020
 
 common:
-	* Fixed missibg LAPACK functions (inadvertently dropped during
+	* Fixed missing BLAS/LAPACK functions (inadvertently dropped during
 	  the build system restructuring)
 	* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
 
 POWER:
 	* Added optimized SCOPY/CCOPY kernels for POWER10
 	* Increased and unified the default size of the GEMM BUFFER
-	* Fixed building for POWER1ß in DYNAMIC_ARCH mode
+	* Fixed building for POWER10 in DYNAMIC_ARCH mode
 	* POWER10 compatibility test now checks binutils version as well
 	* Cleaned up compiler warnings
 

From 81fcfd5ed3ecc3a5f1aefec9ab202d487af85da0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 24 Oct 2020 23:28:29 +0200
Subject: [PATCH 47/83] Update version to 0.3.12.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53c1709a8..aeb4399e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 12)
+set(OpenBLAS_PATCH_VERSION 12.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions

From 2f9fc9be30e33efb21b7873c8ee060af190aabd8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 24 Oct 2020 23:29:05 +0200
Subject: [PATCH 48/83] Update version to 0.3.12.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index a4d11dc7c..1a0965d08 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.12
+VERSION = 0.3.12.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From fd7da56965a5af99f7ec2af161f0057f8b9d6bdb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Oct 2020 12:01:50 +0100
Subject: [PATCH 49/83] Move definitions that are neither needed nor supported
 on SUNOS

---
 driver/others/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index ba2bb55b9..f0521ab2d 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -1767,11 +1767,11 @@ int get_num_procs(void);
 int get_num_procs(void) {
 
   static int nums = 0;
+
+#if defined(__GLIBC_PREREQ)
   cpu_set_t cpuset,*cpusetp;
   size_t size;
   int ret;
-
-#if defined(__GLIBC_PREREQ)
 #if !__GLIBC_PREREQ(2, 7)
   int i;
 #if !__GLIBC_PREREQ(2, 6)

From eec517af0eb1bea187236ccd1072741fbabce01c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Oct 2020 00:21:56 +0100
Subject: [PATCH 50/83] Expressly enable neon for use with intrinsics if
 available

---
 Makefile.arm | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Makefile.arm b/Makefile.arm
index fac6b56824..a27b58e84 100644
--- a/Makefile.arm
+++ b/Makefile.arm
@@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6)
 CCOMMON_OPT += -mfpu=vfp
 FCOMMON_OPT += -mfpu=vfp
 endif
+
+ifdef HAVE_NEON
+CCOMMON_OPT += -mfpu=neon
+FCOMMON_OPT += -mfpu=neon
+endif

From f917c26e83e040270cb98488b296a5c85cbb5ffb Mon Sep 17 00:00:00 2001
From: Qiyu8 <fangchunlin@huawei.com>
Date: Mon, 26 Oct 2020 10:25:05 +0800
Subject: [PATCH 51/83] Refactoring remaining benchmark cases.

---
 benchmark/amin.c       |  1 -
 benchmark/bench.h      |  1 +
 benchmark/dot.c        |  1 -
 benchmark/gemm3m.c     |  1 -
 benchmark/gemv.c       | 86 ++++-------------------------------------
 benchmark/ger.c        | 86 +++--------------------------------------
 benchmark/gesv.c       | 83 ++--------------------------------------
 benchmark/getri.c      | 79 ++------------------------------------
 benchmark/hbmv.c       | 84 ++--------------------------------------
 benchmark/hemm.c       | 81 ++-------------------------------------
 benchmark/hemv.c       | 82 ++-------------------------------------
 benchmark/her.c        | 85 ++---------------------------------------
 benchmark/her2.c       | 85 ++---------------------------------------
 benchmark/her2k.c      | 81 ++-------------------------------------
 benchmark/herk.c       | 83 ++--------------------------------------
 benchmark/hpmv.c       | 82 ++-------------------------------------
 benchmark/iamax.c      | 80 ++------------------------------------
 benchmark/iamin.c      | 80 ++------------------------------------
 benchmark/imax.c       | 80 ++------------------------------------
 benchmark/imin.c       | 80 ++------------------------------------
 benchmark/linpack.c    | 85 ++++-------------------------------------
 benchmark/max.c        | 80 ++------------------------------------
 benchmark/min.c        | 80 ++------------------------------------
 benchmark/nrm2.c       | 80 ++------------------------------------
 benchmark/potrf.c      | 56 +++++----------------------
 benchmark/rot.c        | 79 ++------------------------------------
 benchmark/rotm.c       | 82 +++------------------------------------
 benchmark/scal.c       | 80 ++------------------------------------
 benchmark/spmv.c       | 81 ++-------------------------------------
 benchmark/spr.c        | 82 ++-------------------------------------
 benchmark/spr2.c       | 80 ++------------------------------------
 benchmark/swap.c       | 79 ++------------------------------------
 benchmark/symm.c       | 80 ++------------------------------------
 benchmark/symv.c       | 80 ++------------------------------------
 benchmark/syr.c        | 80 ++------------------------------------
 benchmark/syr2.c       | 81 ++-------------------------------------
 benchmark/syr2k.c      | 79 ++------------------------------------
 benchmark/syrk.c       | 80 ++------------------------------------
 benchmark/tpmv.c       | 48 ++---------------------
 benchmark/tpsv.c       | 48 ++---------------------
 benchmark/trmm.c       | 79 ++------------------------------------
 benchmark/trmv.c       | 48 ++---------------------
 benchmark/trsm.c       | 79 ++------------------------------------
 benchmark/trsv.c       | 87 ++----------------------------------------
 benchmark/zdot-intel.c | 83 +++-------------------------------------
 benchmark/zdot.c       | 81 ++-------------------------------------
 46 files changed, 184 insertions(+), 3114 deletions(-)

diff --git a/benchmark/amin.c b/benchmark/amin.c
index 4bcff9bba..54a1d266a 100644
--- a/benchmark/amin.c
+++ b/benchmark/amin.c
@@ -57,7 +57,6 @@ int main(int argc, char *argv[])
   int to = 200;
   int step = 1;
 
-  struct timeval start, stop;
   double time1, timeg;
 
   argc--;
diff --git a/benchmark/bench.h b/benchmark/bench.h
index 9055beaa7..1f9b8986c 100644
--- a/benchmark/bench.h
+++ b/benchmark/bench.h
@@ -67,6 +67,7 @@ static void *huge_malloc(BLASLONG size){
   return address;
 }
 
+
 #define malloc huge_malloc
 
 #endif
diff --git a/benchmark/dot.c b/benchmark/dot.c
index 86f4e3828..72a756249 100644
--- a/benchmark/dot.c
+++ b/benchmark/dot.c
@@ -49,7 +49,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c
index 76b8176b2..f505ca049 100644
--- a/benchmark/gemm3m.c
+++ b/benchmark/gemm3m.c
@@ -62,7 +62,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
diff --git a/benchmark/gemv.c b/benchmark/gemv.c
index fb1f541d3..a0001277a 100644
--- a/benchmark/gemv.c
+++ b/benchmark/gemv.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 
 #undef GEMV
@@ -52,72 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *x, *y;
@@ -137,7 +66,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -211,10 +139,10 @@ int main(int argc, char *argv[]){
    			for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    			}
-    			gettimeofday( &start, (struct timezone *)0);
+    			begin();
     			GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
-    			gettimeofday( &stop, (struct timezone *)0);
-    			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    			end();
+    			time1 = getsec();
 			timeg += time1;
 
     		}
@@ -248,10 +176,10 @@ int main(int argc, char *argv[]){
    			for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    			}
-    			gettimeofday( &start, (struct timezone *)0);
+    			begin();
     			GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
-    			gettimeofday( &stop, (struct timezone *)0);
-    			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    			end();
+    			time1 = getsec();
 			timeg += time1;
 
     		}
diff --git a/benchmark/ger.c b/benchmark/ger.c
index d53d328f0..7ce08c3ad 100644
--- a/benchmark/ger.c
+++ b/benchmark/ger.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef GER
 
@@ -49,72 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *x, *y;
@@ -131,7 +59,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -198,16 +125,13 @@ int main(int argc, char *argv[]){
     for (l=0; l<loops; l++)
     {
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	GER (&m, &n, alpha, x, &inc_x, y, &inc_y, a , &m);
 
-    	gettimeofday( &stop, (struct timezone *)0);
-
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-	timeg += time1;
-
+    	end();
+      
+      timeg += getsec();
     }
 
     timeg /= loops;
diff --git a/benchmark/gesv.c b/benchmark/gesv.c
index 057cbd243..1806b31be 100644
--- a/benchmark/gesv.c
+++ b/benchmark/gesv.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 double fabs(double);
 
@@ -66,71 +61,6 @@ double fabs(double);
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b;
@@ -142,7 +72,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -194,22 +123,18 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     GESV (&m, &m, a, &m, ipiv, b, &m,  &info);
 
-    gettimeofday( &stop, (struct timezone *)0);
-
-
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
+    end();
 
+    time1 = getsec();
 
     fprintf(stderr,
 	    "%10.2f MFlops %10.6f s\n",
 	    COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1);
 
-
   }
 
   return 0;
diff --git a/benchmark/getri.c b/benchmark/getri.c
index a07014768..98a860906 100644
--- a/benchmark/getri.c
+++ b/benchmark/getri.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 #undef GETRF
 #undef GETRI
@@ -72,71 +67,6 @@
 
 extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info);
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a,*work;
@@ -148,7 +78,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -205,21 +134,21 @@ int main(int argc, char *argv[]){
       exit(1);
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     lwork = -1;
     GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
 
     lwork = (blasint)wkopt[0];
     GETRI(&m, a, &m, ipiv, work, &lwork, &info);
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
     if (info) {
       fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
       exit(1);
     }
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops : %10.2f Sec : %d\n",
diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c
index 60ba9fb89..35249bdf9 100644
--- a/benchmark/hbmv.c
+++ b/benchmark/hbmv.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef HBMV
 
-
 #ifdef DOUBLE
 #define HBMV   BLASFUNC(zhbmv)
 #else
 #define HBMV   BLASFUNC(chbmv)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz) {
-
-    FILETIME ft;
-    unsigned __int64 tmpres = 0;
-    static int tzflag;
-
-    if (NULL != tv)
-    {
-        GetSystemTimeAsFileTime(&ft);
-
-        tmpres |= ft.dwHighDateTime;
-        tmpres <<= 32;
-        tmpres |= ft.dwLowDateTime;
-
-        /*converting file time to unix epoch*/
-        tmpres /= 10;  /*convert into microseconds*/
-        tmpres -= DELTA_EPOCH_IN_MICROSECS;
-        tv->tv_sec = (long)(tmpres / 1000000UL);
-        tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-    return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size) {
-    int shmid;
-    void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-    if ((shmid =shmget(IPC_PRIVATE,
-                (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-        printf( "Memory allocation failed(shmget).\n");
-        exit(1);
-    }
-
-    address = shmat(shmid, NULL, SHM_RND);
-
-    if ((BLASLONG)address == -1){
-        printf( "Memory allocation failed(shmat).\n");
-        exit(1);
-    }
-
-    shmctl(shmid, IPC_RMID, 0);
-
-    return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
     FLOAT *a, *x, *y;
@@ -125,7 +52,6 @@ int main(int argc, char *argv[]){
     int to   = 200;
     int step =   1;
 
-    struct timeval start, stop;
     double time1,timeg;
 
     argc--;argv++;
@@ -186,15 +112,13 @@ int main(int argc, char *argv[]){
             y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
         }
 
-        gettimeofday( &start, (struct timezone *)0);
+        begin();
 
         HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
 
-        gettimeofday( &stop, (struct timezone *)0);
+        end();
 
-        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-        timeg += time1;
+        timeg += getsec();
 
     }
 
diff --git a/benchmark/hemm.c b/benchmark/hemm.c
index 2bc165458..a0a9985ad 100644
--- a/benchmark/hemm.c
+++ b/benchmark/hemm.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef HEMM
 
@@ -41,72 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define HEMM   BLASFUNC(chemm)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b, *c;
@@ -126,7 +54,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -170,13 +97,13 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops\n",
diff --git a/benchmark/hemv.c b/benchmark/hemv.c
index 98618a04e..ad130ddd0 100644
--- a/benchmark/hemv.c
+++ b/benchmark/hemv.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef HEMV
 
-
 #ifdef DOUBLE
 #define HEMV   BLASFUNC(zhemv)
 #else
 #define HEMV   BLASFUNC(chemv)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *x, *y;
@@ -124,7 +51,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -182,13 +108,13 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/her.c b/benchmark/her.c
index 010f8120d..cd1fb7f48 100644
--- a/benchmark/her.c
+++ b/benchmark/her.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef HER
 
-
 #ifdef DOUBLE
 #define HER   BLASFUNC(zher)
 #else
 #define HER   BLASFUNC(cher)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-    FILETIME ft;
-    unsigned __int64 tmpres = 0;
-    static int tzflag;
-
-    if (NULL != tv)
-    {
-        GetSystemTimeAsFileTime(&ft);
-
-        tmpres |= ft.dwHighDateTime;
-        tmpres <<= 32;
-        tmpres |= ft.dwLowDateTime;
-
-        /*converting file time to unix epoch*/
-        tmpres /= 10;  /*convert into microseconds*/
-        tmpres -= DELTA_EPOCH_IN_MICROSECS;
-        tv->tv_sec = (long)(tmpres / 1000000UL);
-        tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-    return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
     FLOAT *a, *x;
@@ -126,8 +53,6 @@ int main(int argc, char *argv[]){
     int from =   1;
     int to   = 200;
     int step =   1;
-
-    struct timeval start, stop;
     double time1;
 
     argc--;argv++;
@@ -166,15 +91,13 @@ int main(int argc, char *argv[]){
             x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
         }
 
-        gettimeofday( &start, (struct timezone *)0);
+        begin();
 
         HER (&uplo, &m, alpha, x, &incx, a, &m );
 
-        gettimeofday( &stop, (struct timezone *)0);
+        end();
 
-        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-        gettimeofday( &start, (struct timezone *)0);
+        time1 = getsec();
 
         fprintf(stderr,
                 " %10.2f MFlops\n",
diff --git a/benchmark/her2.c b/benchmark/her2.c
index 0f80f3ed9..d87bfd466 100644
--- a/benchmark/her2.c
+++ b/benchmark/her2.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef HER2
 
-
 #ifdef DOUBLE
 #define HER2   BLASFUNC(zher2)
 #else
 #define HER2   BLASFUNC(cher2)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-    FILETIME ft;
-    unsigned __int64 tmpres = 0;
-    static int tzflag;
-
-    if (NULL != tv)
-    {
-        GetSystemTimeAsFileTime(&ft);
-
-        tmpres |= ft.dwHighDateTime;
-        tmpres <<= 32;
-        tmpres |= ft.dwLowDateTime;
-
-        /*converting file time to unix epoch*/
-        tmpres /= 10;  /*convert into microseconds*/
-        tmpres -= DELTA_EPOCH_IN_MICROSECS;
-        tv->tv_sec = (long)(tmpres / 1000000UL);
-        tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-    return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
     FLOAT *a, *x, *y;
@@ -127,7 +54,6 @@ int main(int argc, char *argv[]){
     int to   = 200;
     int step =   1;
 
-    struct timeval start, stop;
     double time1;
 
     argc--;argv++;
@@ -169,16 +95,13 @@ int main(int argc, char *argv[]){
             y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
         }
 
-        gettimeofday( &start, (struct timezone *)0);
-
+        begin();
 
         HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m );
 
-        gettimeofday( &stop, (struct timezone *)0);
+        end();
 
-        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
-        gettimeofday( &start, (struct timezone *)0);
+        time1 = getsec();
 
         fprintf(stderr,
                 " %10.2f MFlops\n",
diff --git a/benchmark/her2k.c b/benchmark/her2k.c
index 021873beb..d3cdce696 100644
--- a/benchmark/her2k.c
+++ b/benchmark/her2k.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef HER2K
 #ifdef DOUBLE
@@ -40,72 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define HER2K   BLASFUNC(cher2k)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b, *c;
@@ -125,7 +53,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -169,13 +96,13 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops\n",
diff --git a/benchmark/herk.c b/benchmark/herk.c
index c09d35c1f..628dc2c11 100644
--- a/benchmark/herk.c
+++ b/benchmark/herk.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef HERK
 
-
 #ifdef DOUBLE
 #define HERK   BLASFUNC(zherk)
 #else
 #define HERK   BLASFUNC(cherk)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *c;
@@ -127,7 +54,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -167,18 +93,17 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops\n",
 	    COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
-
   }
 
   return 0;
diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c
index b0157094e..907e2adc4 100644
--- a/benchmark/hpmv.c
+++ b/benchmark/hpmv.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef HPMV
 
-
 #ifdef DOUBLE
 #define HPMV   BLASFUNC(zhpmv)
 #else
 #define HPMV   BLASFUNC(chpmv)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz) {
-
-    FILETIME ft;
-    unsigned __int64 tmpres = 0;
-    static int tzflag;
-
-    if (NULL != tv)
-    {
-        GetSystemTimeAsFileTime(&ft);
-
-        tmpres |= ft.dwHighDateTime;
-        tmpres <<= 32;
-        tmpres |= ft.dwLowDateTime;
-
-        /*converting file time to unix epoch*/
-        tmpres /= 10;  /*convert into microseconds*/
-        tmpres -= DELTA_EPOCH_IN_MICROSECS;
-        tv->tv_sec = (long)(tmpres / 1000000UL);
-        tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-    return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size) {
-    int shmid;
-    void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-    if ((shmid =shmget(IPC_PRIVATE,
-                (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-        printf( "Memory allocation failed(shmget).\n");
-        exit(1);
-    }
-
-    address = shmat(shmid, NULL, SHM_RND);
-
-    if ((BLASLONG)address == -1){
-        printf( "Memory allocation failed(shmat).\n");
-        exit(1);
-    }
-
-    shmctl(shmid, IPC_RMID, 0);
-
-    return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
     FLOAT *a, *x, *y;
@@ -124,7 +51,6 @@ int main(int argc, char *argv[]){
     int to   = 200;
     int step =   1;
 
-    struct timeval start, stop;
     double time1,timeg;
 
     argc--;argv++;
@@ -183,13 +109,13 @@ int main(int argc, char *argv[]){
             y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
         }
 
-        gettimeofday( &start, (struct timezone *)0);
+        begin();
 
         HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
 
-        gettimeofday( &stop, (struct timezone *)0);
+        end();
 
-        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+        time1 = getsec();
 
         timeg += time1;
 
diff --git a/benchmark/iamax.c b/benchmark/iamax.c
index c87044ab4..15618cbcc 100644
--- a/benchmark/iamax.c
+++ b/benchmark/iamax.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef IAMAX
 
@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x;
@@ -127,7 +56,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -166,13 +94,13 @@ int main(int argc, char *argv[]){
 			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	IAMAX (&m, x, &inc_x);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/iamin.c b/benchmark/iamin.c
index e7c8e59e4..a57638ecc 100644
--- a/benchmark/iamin.c
+++ b/benchmark/iamin.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef IAMIN
 
@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x;
@@ -127,7 +56,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -166,13 +94,13 @@ int main(int argc, char *argv[]){
 			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	IAMIN (&m, x, &inc_x);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/imax.c b/benchmark/imax.c
index b56ef64ba..b96b17167 100644
--- a/benchmark/imax.c
+++ b/benchmark/imax.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef IMAX
 
@@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x;
@@ -121,7 +50,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -160,13 +88,13 @@ int main(int argc, char *argv[]){
 			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	IMAX (&m, x, &inc_x);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/imin.c b/benchmark/imin.c
index 4a92c8bd0..095eacca9 100644
--- a/benchmark/imin.c
+++ b/benchmark/imin.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef IMIN
 
@@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x;
@@ -121,7 +50,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -160,13 +88,13 @@ int main(int argc, char *argv[]){
 			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	IMIN (&m, x, &inc_x);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/linpack.c b/benchmark/linpack.c
index 661a44175..202035245 100644
--- a/benchmark/linpack.c
+++ b/benchmark/linpack.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 double fabs(double);
 
@@ -72,71 +67,6 @@ double fabs(double);
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b;
@@ -151,7 +81,6 @@ int main(int argc, char *argv[]){
 
   FLOAT maxerr;
 
-  struct timeval start, stop;
   double time1, time2;
 
   argc--;argv++;
@@ -198,31 +127,31 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     GETRF (&m, &m, a, &m, ipiv, &info);
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
     if (info) {
       fprintf(stderr, "Matrix is not singular .. %d\n", info);
       exit(1);
     }
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info);
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
     if (info) {
       fprintf(stderr, "Matrix is not singular .. %d\n", info);
       exit(1);
     }
 
-    time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time2 = getsec();
 
     maxerr = 0.;
 
diff --git a/benchmark/max.c b/benchmark/max.c
index a19a386a2..301b943a5 100644
--- a/benchmark/max.c
+++ b/benchmark/max.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef NAMAX
 
@@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x;
@@ -121,7 +50,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -160,13 +88,13 @@ int main(int argc, char *argv[]){
 			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	NAMAX (&m, x, &inc_x);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/min.c b/benchmark/min.c
index 4df8fb0fd..39df37a29 100644
--- a/benchmark/min.c
+++ b/benchmark/min.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef NAMIN
 
@@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x;
@@ -121,7 +50,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -160,13 +88,13 @@ int main(int argc, char *argv[]){
 			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	NAMIN (&m, x, &inc_x);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/nrm2.c b/benchmark/nrm2.c
index 0f416621a..cd64d564a 100644
--- a/benchmark/nrm2.c
+++ b/benchmark/nrm2.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef NRM2
 
@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x;
@@ -127,7 +56,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -166,13 +94,13 @@ int main(int argc, char *argv[]){
 			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
 
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	NRM2 (&m, x, &inc_x);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/potrf.c b/benchmark/potrf.c
index cb4c23bab..116d0cca5 100644
--- a/benchmark/potrf.c
+++ b/benchmark/potrf.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 double fabs(double);
 
@@ -86,37 +81,7 @@ double fabs(double);
 // extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info);
 // extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info);
 
-#if defined(__WIN32__) || defined(__WIN64__)
 
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
 
 int main(int argc, char *argv[]){
 
@@ -141,7 +106,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -217,18 +181,18 @@ int main(int argc, char *argv[]){
 
       SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m);
 
-      gettimeofday( &start, (struct timezone *)0);
+      begin();
 
       POTRF(uplo[uplos], &m, b, &m, &info);
 
-      gettimeofday( &stop, (struct timezone *)0);
+      end();
 
       if (info != 0) {
 	fprintf(stderr, "Potrf info = %d\n", info);
 	exit(1);
       }
 
-      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+      time1 = getsec();
       flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
 
       if ( btest == 'S' )
@@ -240,17 +204,17 @@ int main(int argc, char *argv[]){
       		}
     	}
 
-      	gettimeofday( &start, (struct timezone *)0);
+      	begin();
 
       	POTRS(uplo[uplos], &m, &m, b, &m, a, &m,  &info);
 
-      	gettimeofday( &stop, (struct timezone *)0);
+      	end();
 
       	if (info != 0) {
 		fprintf(stderr, "Potrs info = %d\n", info);
 		exit(1);
         }
-        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+        time1 = getsec();
         flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
 
       }
@@ -258,18 +222,18 @@ int main(int argc, char *argv[]){
       if ( btest == 'I' )
       {
 	
-      	gettimeofday( &start, (struct timezone *)0);
+      	begin();
 
       	POTRI(uplo[uplos], &m, b, &m, &info);
 
-      	gettimeofday( &stop, (struct timezone *)0);
+      	end();
 
       	if (info != 0) {
 		fprintf(stderr, "Potri info = %d\n", info);
 		exit(1);
         }
 
-        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+        time1 = getsec();
         flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
       }
 	
diff --git a/benchmark/rot.c b/benchmark/rot.c
index 69698988d..15b630e36 100644
--- a/benchmark/rot.c
+++ b/benchmark/rot.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 #undef ROT
 
@@ -52,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -133,7 +63,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -179,13 +108,13 @@ int main(int argc, char *argv[]){
 
    for (l=0; l<loops; l++)
    {
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	ROT (&m, x, &inc_x, y, &inc_y, c, s);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	    timeg += time1;
 
diff --git a/benchmark/rotm.c b/benchmark/rotm.c
index 17c8d5416..7f333e220 100644
--- a/benchmark/rotm.c
+++ b/benchmark/rotm.c
@@ -25,12 +25,7 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 #undef ROTM
 
@@ -40,72 +35,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ROTM BLASFUNC(srotm)
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz)
-{
-
-    FILETIME ft;
-    unsigned __int64 tmpres = 0;
-    static int tzflag;
-
-    if (NULL != tv) {
-        GetSystemTimeAsFileTime(&ft);
-
-        tmpres |= ft.dwHighDateTime;
-        tmpres <<= 32;
-        tmpres |= ft.dwLowDateTime;
-
-        /*converting file time to unix epoch*/
-        tmpres /= 10; /*convert into microseconds*/
-        tmpres -= DELTA_EPOCH_IN_MICROSECS;
-        tv->tv_sec = (long)(tmpres / 1000000UL);
-        tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-    return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
-    int shmid;
-    void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-    if ((shmid =
-             shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                    SHM_HUGETLB | IPC_CREAT | 0600)) < 0) {
-        printf("Memory allocation failed(shmget).\n");
-        exit(1);
-    }
-
-    address = shmat(shmid, NULL, SHM_RND);
-
-    if ((BLASLONG)address == -1) {
-        printf("Memory allocation failed(shmat).\n");
-        exit(1);
-    }
-
-    shmctl(shmid, IPC_RMID, 0);
-
-    return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[])
 {
 
@@ -122,7 +51,7 @@ int main(int argc, char *argv[])
     int to = 200;
     int step = 1;
 
-    struct timeval start, stop;
+    
     double time1, timeg;
 
     argc--;
@@ -188,14 +117,13 @@ int main(int argc, char *argv[])
         }
 
         for (l = 0; l < loops; l++) {
-            gettimeofday(&start, (struct timezone *)0);
+            begin();
 
             ROTM(&m, x, &inc_x, y, &inc_y, param);
 
-            gettimeofday(&stop, (struct timezone *)0);
+            end();
 
-            time1 = (double)(stop.tv_sec - start.tv_sec) +
-                    (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+            time1 = getsec();
 
             timeg += time1;
         }
diff --git a/benchmark/scal.c b/benchmark/scal.c
index 8bd62c77c..8de6cfd04 100644
--- a/benchmark/scal.c
+++ b/benchmark/scal.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef SCAL
 
@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -128,7 +57,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -174,13 +102,13 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	SCAL (&m, alpha, x, &inc_x);
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/spmv.c b/benchmark/spmv.c
index cff504d3b..e4dcbf4ae 100644
--- a/benchmark/spmv.c
+++ b/benchmark/spmv.c
@@ -25,17 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef SPMV
 
-
 #ifndef COMPLEX
 
 #ifdef DOUBLE
@@ -54,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *x, *y;
@@ -135,7 +63,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -193,13 +120,13 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/spr.c b/benchmark/spr.c
index 5dcaa4f8b..2fc9994f8 100755
--- a/benchmark/spr.c
+++ b/benchmark/spr.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef SPR
 
@@ -41,73 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SPR   BLASFUNC(sspr)
 #endif
 
-
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a,*c;
@@ -129,7 +56,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -173,13 +99,13 @@ int main(int argc, char *argv[]){
 			c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 		}
 
-		gettimeofday( &start, (struct timezone *)0);
+		begin();
 
 		SPR (&uplo, &m, alpha, c, &inc_x, a);
 
-		gettimeofday( &stop, (struct timezone *)0);
+		end();
 
-		time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+		time1 = getsec();
 		
 		timeg += time1;
    }
diff --git a/benchmark/spr2.c b/benchmark/spr2.c
index a5f2791f7..8f194e83a 100755
--- a/benchmark/spr2.c
+++ b/benchmark/spr2.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 
 #undef SPR2
@@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a,*b,*c;
@@ -129,7 +58,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -182,13 +110,13 @@ int main(int argc, char *argv[]){
 			c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 		}
 
-		gettimeofday( &start, (struct timezone *)0);
+		begin();
 
 		SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a);
 
-		gettimeofday( &stop, (struct timezone *)0);
+		end();
 
-		time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+		time1 = getsec();
 		
 		timeg += time1;
    }
diff --git a/benchmark/swap.c b/benchmark/swap.c
index 76d545995..64ebe5e9b 100644
--- a/benchmark/swap.c
+++ b/benchmark/swap.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 
 #undef SWAP
@@ -49,71 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -128,7 +58,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -175,13 +104,13 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	SWAP (&m, x, &inc_x, y, &inc_y );
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/symm.c b/benchmark/symm.c
index bb9849eb5..1c6d91d00 100644
--- a/benchmark/symm.c
+++ b/benchmark/symm.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef SYMM
 
@@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b, *c;
@@ -137,7 +66,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -181,13 +109,13 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops\n",
diff --git a/benchmark/symv.c b/benchmark/symv.c
index e4c892b5a..0a35aaef0 100644
--- a/benchmark/symv.c
+++ b/benchmark/symv.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef SYMV
 
@@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *x, *y;
@@ -134,7 +63,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -192,13 +120,13 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/syr.c b/benchmark/syr.c
index a9dd293e6..ebbf2bd3c 100644
--- a/benchmark/syr.c
+++ b/benchmark/syr.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 
 #undef SYR
@@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x,*a;
@@ -124,7 +53,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
   
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -165,13 +93,13 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     SYR (&uplo, &m, alpha, x, &inc_x, a, &m );
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops\n",
diff --git a/benchmark/syr2.c b/benchmark/syr2.c
index 9efbca315..acbc86987 100644
--- a/benchmark/syr2.c
+++ b/benchmark/syr2.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef SYR2
 
@@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYR2   BLASFUNC(ssyr2)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y, *a;
@@ -125,7 +53,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -174,13 +101,13 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m );
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops\n",
diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c
index a906559eb..3895c2861 100644
--- a/benchmark/syr2k.c
+++ b/benchmark/syr2k.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 
 #undef SYR2K
@@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b, *c;
@@ -137,7 +67,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -181,13 +110,13 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops\n",
diff --git a/benchmark/syrk.c b/benchmark/syrk.c
index 0fbb943f6..82606a21a 100644
--- a/benchmark/syrk.c
+++ b/benchmark/syrk.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef SYRK
 
@@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *c;
@@ -137,7 +66,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -177,13 +105,13 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops\n",
diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c
index fe9d07534..41f2e0fb8 100644
--- a/benchmark/tpmv.c
+++ b/benchmark/tpmv.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 #undef TPMV
 
@@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
-    int shmid;
-    void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-    if ((shmid =shmget(IPC_PRIVATE,
-             (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-             SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-        printf( "Memory allocation failed(shmget).\n");
-        exit(1);
-    }
-
-    address = shmat(shmid, NULL, SHM_RND);
-
-    if ((BLASLONG)address == -1) {
-        printf( "Memory allocation failed(shmat).\n");
-        exit(1);
-    }
-
-    shmctl(shmid, IPC_RMID, 0);
-
-    return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[])
 {
 
@@ -112,7 +73,6 @@ int main(int argc, char *argv[])
     int to   = 200;
     int step =   1;
 
-    struct timespec start = { 0, 0 }, stop = { 0, 0 };
     double time1, timeg;
 
     argc--;argv++;
@@ -153,11 +113,11 @@ int main(int argc, char *argv[])
         }
 
         for (l = 0; l < loops; l++) {
-            clock_gettime(CLOCK_REALTIME, &start);
+            begin();
             TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x);
-            clock_gettime(CLOCK_REALTIME, &stop);
+            end();
 
-            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
+            time1 = getsec();
             timeg += time1;
         }
 
diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c
index 8472ac261..ebfa29692 100644
--- a/benchmark/tpsv.c
+++ b/benchmark/tpsv.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 #undef TPSV
 
@@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
-    int shmid;
-    void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-    if ((shmid =shmget(IPC_PRIVATE,
-             (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-             SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-        printf( "Memory allocation failed(shmget).\n");
-        exit(1);
-    }
-
-    address = shmat(shmid, NULL, SHM_RND);
-
-    if ((BLASLONG)address == -1) {
-        printf( "Memory allocation failed(shmat).\n");
-        exit(1);
-    }
-
-    shmctl(shmid, IPC_RMID, 0);
-
-    return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[])
 {
 
@@ -112,7 +73,6 @@ int main(int argc, char *argv[])
     int to   = 200;
     int step =   1;
 
-    struct timespec start = { 0, 0 }, stop = { 0, 0 };
     double time1, timeg;
 
     argc--;argv++;
@@ -153,11 +113,11 @@ int main(int argc, char *argv[])
         }
 
         for (l = 0; l < loops; l++) {
-            clock_gettime(CLOCK_REALTIME, &start);
+            begin();
             TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x);
-            clock_gettime(CLOCK_REALTIME, &stop);
+            end();
 
-            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
+            time1 = getsec();
             timeg += time1;
         }
 
diff --git a/benchmark/trmm.c b/benchmark/trmm.c
index 23af122b4..3ab9fc255 100644
--- a/benchmark/trmm.c
+++ b/benchmark/trmm.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 
 #undef TRMM
@@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b;
@@ -141,7 +71,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -180,13 +109,13 @@ int main(int argc, char *argv[]){
       }
     }
 
-    gettimeofday( &start, (struct timezone *)0);
+    begin();
 
     TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
 
-    gettimeofday( &stop, (struct timezone *)0);
+    end();
 
-    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    time1 = getsec();
 
     fprintf(stderr,
 	    " %10.2f MFlops  %10.6f sec\n",
diff --git a/benchmark/trmv.c b/benchmark/trmv.c
index 46641b3e4..0e8088b54 100644
--- a/benchmark/trmv.c
+++ b/benchmark/trmv.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 #undef TRMV
 
@@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
-    int shmid;
-    void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-    if ((shmid =shmget(IPC_PRIVATE,
-             (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-             SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-        printf( "Memory allocation failed(shmget).\n");
-        exit(1);
-    }
-
-    address = shmat(shmid, NULL, SHM_RND);
-
-    if ((BLASLONG)address == -1) {
-        printf( "Memory allocation failed(shmat).\n");
-        exit(1);
-    }
-
-    shmctl(shmid, IPC_RMID, 0);
-
-    return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[])
 {
 
@@ -112,7 +73,6 @@ int main(int argc, char *argv[])
     int to   = 200;
     int step =   1;
 
-    struct timespec start = { 0, 0 }, stop = { 0, 0 };
     double time1, timeg;
 
     argc--;argv++;
@@ -153,11 +113,11 @@ int main(int argc, char *argv[])
         }
 
         for (l = 0; l < loops; l++) {
-            clock_gettime(CLOCK_REALTIME, &start);
+            begin();
             TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x);
-            clock_gettime(CLOCK_REALTIME, &stop);
+            end();
 
-            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
+            time1 = getsec();
             timeg += time1;
         }
 
diff --git a/benchmark/trsm.c b/benchmark/trsm.c
index 17676946a..d2ebd7f54 100644
--- a/benchmark/trsm.c
+++ b/benchmark/trsm.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
 
 
 #undef TRSM
@@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *b;
@@ -151,7 +81,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1;
 
   argc--;argv++;
@@ -196,13 +125,13 @@ int main(int argc, char *argv[]){
       		 	}
     		 }
 
-    		gettimeofday( &start, (struct timezone *)0);
+    		begin();
 
     		TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
 
-    		gettimeofday( &stop, (struct timezone *)0);
+    		end();
 
-    		time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    		time1 = getsec();
 
 		timeg += time1;
         }
diff --git a/benchmark/trsv.c b/benchmark/trsv.c
index 1734e2adb..66ac3a3c7 100644
--- a/benchmark/trsv.c
+++ b/benchmark/trsv.c
@@ -25,14 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include <time.h>
-#include "common.h"
-
+#include "bench.h"
 
 #undef GEMV
 #undef TRSV
@@ -55,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *a, *x;
@@ -133,7 +61,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timespec time_start, time_end;
   time_t seconds = 0;
 
   double time1,timeg;
@@ -189,19 +116,13 @@ int main(int argc, char *argv[]){
 
       for(l =0;l< loops;l++){
 
-          clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start);
-
+          begin();
           TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x);
-
-          clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end);
-          nanos = time_end.tv_nsec - time_start.tv_nsec;
-          seconds = time_end.tv_sec - time_start.tv_sec;
-
-          time1 = seconds + nanos /1.e9;
+          end();
+          time1 = getsec();
           timeg += time1;
       }
 
-
       timeg /= loops;
       long long muls = n*(n+1)/2.0;
       long long adds = (n - 1.0)*n/2.0;
diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c
index ba1515365..06cdde13a 100644
--- a/benchmark/zdot-intel.c
+++ b/benchmark/zdot-intel.c
@@ -25,90 +25,18 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#define RETURN_BY_STACK 1
-#include "common.h"
+#define RETURN_BY_STACK 1  /* stays ahead of the include, as it was ahead of common.h before */
 
+#include "bench.h"
 
 #undef DOT
 
-
 #ifdef DOUBLE
 #define DOT   BLASFUNC(zdotu)
 #else
 #define DOT   BLASFUNC(cdotu)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -123,7 +51,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -170,13 +97,13 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 
     	DOT (&result, &m, x, &inc_x, y, &inc_y );
 
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/zdot.c b/benchmark/zdot.c
index fa624e859..23b3efcad 100644
--- a/benchmark/zdot.c
+++ b/benchmark/zdot.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef DOT
 
@@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DOT   BLASFUNC(cdotu)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-		     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -122,7 +50,6 @@ int main(int argc, char *argv[]){
   int to   = 200;
   int step =   1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -169,15 +96,15 @@ int main(int argc, char *argv[]){
    	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    	}
-    	gettimeofday( &start, (struct timezone *)0);
+    	begin();
 #ifdef RETURN_BY_STACK
     	DOT (&result , &m, x, &inc_x, y, &inc_y );
 #else
     	result = DOT (&m, x, &inc_x, y, &inc_y );
 #endif
-    	gettimeofday( &stop, (struct timezone *)0);
+    	end();
 
-    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+    	time1 = getsec();
 
 	timeg += time1;
 

From c24ba8b1dd155b30eb5b7c4e7dc7b38c9e6597e3 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Mon, 26 Oct 2020 13:24:59 -0500
Subject: [PATCH 52/83] Optimize saxpy for POWER10

This patch makes use of new POWER10 vector pair instructions for
loads and stores.
---
 kernel/power/KERNEL.POWER10         |   2 +-
 kernel/power/saxpy_microk_power10.c | 181 ++++++++++++++++++++++++++++
 kernel/power/saxpy_power10.c        | 119 ++++++++++++++++++
 3 files changed, 301 insertions(+), 1 deletion(-)
 create mode 100644 kernel/power/saxpy_microk_power10.c
 create mode 100644 kernel/power/saxpy_power10.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 86df7e3a2..1e514fcc9 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -141,7 +141,7 @@ DASUMKERNEL  = dasum.c
 CASUMKERNEL  = casum.c
 ZASUMKERNEL  = zasum.c
 #
-SAXPYKERNEL  = saxpy.c
+SAXPYKERNEL  = saxpy_power10.c
 DAXPYKERNEL  = daxpy_power10.c
 ifneq ($(GCCVERSIONGTEQ9),1)
 CAXPYKERNEL  = caxpy_power9.S
diff --git a/kernel/power/saxpy_microk_power10.c b/kernel/power/saxpy_microk_power10.c
new file mode 100644
index 000000000..6ede1dcdd
--- /dev/null
+++ b/kernel/power/saxpy_microk_power10.c
@@ -0,0 +1,181 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+/* y[i] += alpha * x[i] for n floats, 64 per pass, using paired VSX loads/stores (lxvp/stxvp); n is decremented by 64 per pass with no remainder handling. */
+static void saxpy_kernel_64(long n, float *x, float *y, float alpha)
+{
+  __vector float t0 = {alpha, alpha,alpha, alpha};
+  /* t0 (asm operand %x4) carries alpha in each of its four lanes. */
+  __asm__
+    (
+       /* Cache-touch hints (dcbt) for the start of x (%2) and y (%3). */
+       "dcbt		0, %2		\n\t"
+       "dcbt		0, %3		\n\t"
+       /* Preload the first 256 bytes of x into vs32-35, vs40-43 and vs48-55. */
+       "lxvp		32, 0(%2)	\n\t"
+       "lxvp		34, 32(%2)	\n\t"
+       "lxvp		40, 64(%2)	\n\t"
+       "lxvp		42, 96(%2)	\n\t"
+       "lxvp		48, 128(%2)	\n\t"
+       "lxvp		50, 160(%2)	\n\t"
+       "lxvp		52, 192(%2)	\n\t"
+       "lxvp		54, 224(%2)	\n\t"
+       /* Preload the first 256 bytes of y into vs36-39, vs44-47 and vs56-63. */
+       "lxvp		36, 0(%3)	\n\t"
+       "lxvp		38, 32(%3)	\n\t"
+       "lxvp		44, 64(%3)	\n\t"
+       "lxvp		46, 96(%3)	\n\t"
+       "lxvp		56, 128(%3)	\n\t"
+       "lxvp		58, 160(%3)	\n\t"
+       "lxvp		60, 192(%3)	\n\t"
+       "lxvp		62, 224(%3)	\n\t"
+       /* Only x is advanced here; y still points at the chunk whose results are pending. */
+       "addi		%2, %2, 256	\n\t"
+       /* n -= 64; when nothing is left, skip the loop and finish the preloaded chunk. */
+       "addic.		%1, %1, -64	\n\t"
+       "ble		two%=		\n\t"
+
+       ".align 5			\n"
+     "one%=:				\n\t"
+       /* Loop: multiply-add and store the current chunk while reloading x and y for the next one. */
+       "xvmaddasp	36, 32, %x4	\n\t"
+       "xvmaddasp	37, 33, %x4	\n\t"
+
+       "lxvp		32, 0(%2)	\n\t"
+       "stxvp		36, 0(%3)	\n\t"
+
+       "xvmaddasp	38, 34, %x4	\n\t"
+       "xvmaddasp	39, 35, %x4	\n\t"
+
+       "lxvp		34, 32(%2)	\n\t"
+       "stxvp		38, 32(%3)	\n\t"
+
+       "lxvp		36, 256(%3)	\n\t"
+       "lxvp		38, 288(%3)	\n\t"
+
+       "xvmaddasp	44, 40, %x4	\n\t"
+       "xvmaddasp	45, 41, %x4	\n\t"
+
+       "lxvp		40, 64(%2)	\n\t"
+       "stxvp		44, 64(%3)	\n\t"
+
+       "xvmaddasp	46, 42, %x4	\n\t"
+       "xvmaddasp	47, 43, %x4	\n\t"
+
+       "lxvp		42, 96(%2)	\n\t"
+       "stxvp		46, 96(%3)	\n\t"
+
+       "lxvp		44, 320(%3)	\n\t"
+       "lxvp		46, 352(%3)	\n\t"
+
+       "xvmaddasp	56, 48, %x4	\n\t"
+       "xvmaddasp	57, 49, %x4	\n\t"
+
+       "lxvp		48, 128(%2)	\n\t"
+       "stxvp		56, 128(%3)	\n\t"
+
+       "xvmaddasp	58, 50, %x4	\n\t"
+       "xvmaddasp	59, 51, %x4	\n\t"
+
+       "lxvp		50, 160(%2)	\n\t"
+       "stxvp		58, 160(%3)	\n\t"
+
+       "lxvp		56, 384(%3)	\n\t"
+       "lxvp		58, 416(%3)	\n\t"
+
+       "xvmaddasp	60, 52, %x4	\n\t"
+       "xvmaddasp	61, 53, %x4	\n\t"
+
+       "lxvp		52, 192(%2)	\n\t"
+       "stxvp		60, 192(%3)	\n\t"
+
+       "xvmaddasp	62, 54, %x4	\n\t"
+       "xvmaddasp	63, 55, %x4	\n\t"
+
+       "lxvp		54, 224(%2)	\n\t"
+       "stxvp		62, 224(%3)	\n\t"
+
+       "lxvp		60, 448(%3)	\n\t"
+       "lxvp		62, 480(%3)	\n\t"
+
+       "addi		%2, %2, 256	\n\t"
+       "addi		%3, %3, 256	\n\t"
+
+       "addic.		%1, %1, -64	\n\t"
+       "bgt		one%=		\n"
+
+     "two%=:				\n\t"
+       /* Tail: multiply-add the last preloaded chunk and store it at the current y. */
+       "xvmaddasp	36, 32, %x4	\n\t"
+       "xvmaddasp	37, 33, %x4	\n\t"
+       "xvmaddasp	38, 34, %x4	\n\t"
+       "xvmaddasp	39, 35, %x4	\n\t"
+
+       "xvmaddasp	44, 40, %x4	\n\t"
+       "xvmaddasp	45, 41, %x4	\n\t"
+       "xvmaddasp	46, 42, %x4	\n\t"
+       "xvmaddasp	47, 43, %x4	\n\t"
+
+       "xvmaddasp	56, 48, %x4	\n\t"
+       "xvmaddasp	57, 49, %x4	\n\t"
+       "xvmaddasp	58, 50, %x4	\n\t"
+       "xvmaddasp	59, 51, %x4	\n\t"
+
+       "xvmaddasp	60, 52, %x4	\n\t"
+       "xvmaddasp	61, 53, %x4	\n\t"
+       "xvmaddasp	62, 54, %x4	\n\t"
+       "xvmaddasp	63, 55, %x4	\n\t"
+       "stxvp		36, 0(%3)	\n\t"
+       "stxvp		38, 32(%3)	\n\t"
+       "stxvp		44, 64(%3)	\n\t"
+       "stxvp		46, 96(%3)	\n\t"
+       "stxvp		56, 128(%3)	\n\t"
+       "stxvp		58, 160(%3)	\n\t"
+       "stxvp		60, 192(%3)	\n\t"
+       "stxvp		62, 224(%3)	\n\t"
+
+     "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n"
+     :
+       "+m" (*y),
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y)		// 3
+     :
+       "wa" (t0),	// 4
+       "m" (*x)
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
+       "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
+     );
+
+}
+
+
diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c
new file mode 100644
index 000000000..8c7c22390
--- /dev/null
+++ b/kernel/power/saxpy_power10.c
@@ -0,0 +1,119 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "saxpy_microk_power10.c"
+#endif
+/* Plain C saxpy_kernel_64, compiled only when the micro-kernel include above did not define HAVE_KERNEL_8. */
+#ifndef HAVE_KERNEL_8
+static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
+{
+	BLASLONG register i = 0;
+	/* Unrolled by 8 with no remainder handling: elements i..i+7 are touched whenever i < n. */
+	while(i < n)
+        {
+              y[i]   += alpha * x[i];
+              y[i+1] += alpha * x[i+1];
+              y[i+2] += alpha * x[i+2];
+              y[i+3] += alpha * x[i+3];
+              y[i+4] += alpha * x[i+4];
+              y[i+5] += alpha * x[i+5];
+              y[i+6] += alpha * x[i+6];
+              y[i+7] += alpha * x[i+7];
+              i+=8 ;
+
+       }
+
+}
+#endif
+/* SAXPY driver: y += da * x over n elements with strides inc_x / inc_y; dummy0, dummy1, dummy and dummy2 are never read. Always returns 0. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	if ( n <= 0 )  return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+		/* Unit stride: the kernel gets the largest multiple of 64 elements, the scalar loop below the rest. */
+		BLASLONG n1 = n & -64;
+
+		if ( n1 )
+			saxpy_kernel_64(n1, x, y, da);
+
+		i = n1;
+		while(i < n)
+		{
+
+			y[i] += da * x[i] ;
+			i++ ;
+
+		}
+		return(0);
+
+
+	}
+	/* General strides: four elements per pass, then a one-by-one remainder loop. */
+	BLASLONG n1 = n & -4;
+
+	while(i < n1)
+	{
+
+		FLOAT m1      = da * x[ix] ;
+		FLOAT m2      = da * x[ix+inc_x] ;
+		FLOAT m3      = da * x[ix+2*inc_x] ;
+		FLOAT m4      = da * x[ix+3*inc_x] ;
+
+		y[iy]         += m1 ;
+		y[iy+inc_y]   += m2 ;
+		y[iy+2*inc_y] += m3 ;
+		y[iy+3*inc_y] += m4 ;
+
+		ix  += inc_x*4 ;
+		iy  += inc_y*4 ;
+		i+=4 ;
+
+	}
+
+	while(i < n)
+	{
+
+		y[iy] += da * x[ix] ;
+		ix  += inc_x ;
+		iy  += inc_y ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+

From 878b6d1f410c740372a9b5addf6c5033d893cc12 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Oct 2020 21:35:40 +0100
Subject: [PATCH 53/83] Remove spurious expr in flang version check

---
 Makefile.system | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index 30d8f4ccf..6d985786d 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -855,7 +855,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG
 FCOMMON_OPT += -Mrecursive -Kieee
 ifeq ($(OSNAME), Linux)
 ifeq ($(ARCH), x86_64)
-FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
+FLANG_VENDOR := $(shell $(FC) --version|cut -f 1 -d "."|head -1)
 ifeq ($(FLANG_VENDOR),AOCC)
 FCOMMON_OPT += -fno-unroll-loops
 endif

From 6a1f3e40af7bd018f47afbf8fc543327b6552e48 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Oct 2020 21:37:04 +0100
Subject: [PATCH 54/83] Remove debug printout of object list

---
 interface/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/Makefile b/interface/Makefile
index 6b247b49f..7b60111f9 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -507,7 +507,7 @@ ifneq ($(BUILD_COMPLEX16),1)
 endif
 
 FUNCOBJS    = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
-$(info FUNCOBJS = {[$(FUNCOBJS)]} )
+
 ifdef EXPRECISION
 FUNCOBJS   += $(QBLASOBJS) $(XBLASOBJS)
 endif

From b937d78a6d87dbda61a14788c33d48b9c885c6ca Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Oct 2020 17:51:32 +0100
Subject: [PATCH 55/83] Try to read cpu information from
 /sys/devices/system/cpu/cpu0 if HWCAP_CPUID fails

---
 driver/others/dynamic_arm64.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index be22b247c..007a221db 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -139,19 +139,30 @@ static gotoblas_t *force_coretype(char *coretype) {
 
 static gotoblas_t *get_coretype(void) {
   int implementer, variant, part, arch, revision, midr_el1;
+  char coremsg[128];
+
+#if (!defined OS_LINUX && !defined OS_ANDROID)
+  return NULL;
+#endif
 
-#if (defined OS_LINUX || defined OS_ANDROID)
   if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
-    char coremsg[128];
+#ifdef __linux
+        /* No HWCAP_CPUID: take the MIDR_EL1 value from the sysfs file below instead. */
+        FILE *infile;
+        char buffer[512];
+	infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r");
+	if (!infile) return NULL;
+	if (!fgets(buffer, sizeof(buffer), infile)) { fclose(infile); return NULL; } /* nothing read: buffer holds no data to parse */
+	midr_el1=strtoul(buffer,NULL,16);
+	fclose(infile);
+#else
     snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
     openblas_warning(1, coremsg);
     return NULL;
-  }
-#else
-   return NULL;
 #endif
-
-  get_cpu_ftr(MIDR_EL1, midr_el1);
+  } else {
+    get_cpu_ftr(MIDR_EL1, midr_el1);
+  }
   /*
    * MIDR_EL1
    *
@@ -219,6 +230,9 @@ static gotoblas_t *get_coretype(void) {
           return &gotoblas_FALKOR;
       }
       break;
+    default:
+      snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
+      openblas_warning(1, coremsg);
   }
   return NULL;
 }

From e8cbf0fc50547e5b50bc2f15549515f64767d104 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Oct 2020 23:01:19 +0100
Subject: [PATCH 56/83] Output predefined HAVE_ entries to Makefile.conf for
 ARM with specified TARGET

---
 getarch.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/getarch.c b/getarch.c
index 3f1448305..ab90f36d9 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1405,8 +1405,41 @@ int main(int argc, char *argv[]){
 
     printf("NUM_CORES=%d\n", get_num_cores());
 
-#if defined(__arm__) && !defined(FORCE)
+#if defined(__arm__) 
+#if !defined(FORCE)
+    fprintf(stderr,"get features!\n");
         get_features();
+#else
+    fprintf(stderr,"split archconfig!\n");
+    sprintf(buffer, "%s", ARCHCONFIG);
+
+    p = &buffer[0];
+
+    while (*p) {
+      if ((*p == '-') && (*(p + 1) == 'D')) {
+	p += 2;
+        if (*p != 'H') {
+		while( (*p != ' ') && (*p != '-') && (*p != '\0') && (*p != '\n')) {p++; }
+		if (*p == '-') continue;
+	}
+	while ((*p != ' ') && (*p != '\0')) {
+
+	  if (*p == '=') {
+	    printf("=");
+	    p ++;
+	    while ((*p != ' ') && (*p != '\0')) {
+	      printf("%c", *p);
+	      p ++;
+	    }
+	  } else {
+	    printf("%c", *p);
+	    p ++;
+	    if ((*p == ' ') || (*p =='\0')) printf("=1\n");
+	  }
+	}
+      } else p ++;
+    }
+#endif
 #endif
 
 

From a7b1f9b1bbbfefb3f8b9dae126afdf054be97eda Mon Sep 17 00:00:00 2001
From: "Chen, Guobing" <guobing.chen@intel.com>
Date: Wed, 28 Oct 2020 08:49:12 +0800
Subject: [PATCH 57/83] Implementation of BF16 based gemv

1. Add a new API -- sbgemv to support bfloat16 based gemv
2. Implement a generic kernel for sbgemv
3. Implement an avx512-bf16 based kernel for sbgemv

Signed-off-by: Chen, Guobing <guobing.chen@intel.com>
---
 cblas.h                                       |    1 +
 cmake/kernel.cmake                            |    4 +-
 common_interface.h                            |    2 +
 common_level2.h                               |    4 +
 common_macro.h                                |   10 +-
 common_param.h                                |    4 +-
 common_sb.h                                   |    4 +
 driver/level2/Makefile                        |   16 +-
 driver/level2/sbgemv_thread.c                 |  149 +
 driver/others/blas_server_omp.c               |    1 -
 exports/gensymbol                             |    4 +-
 interface/Makefile                            |   17 +-
 interface/gemv.c                              |    1 -
 interface/sbgemv.c                            |  210 ++
 kernel/Makefile.L2                            |   22 +
 kernel/setparam-ref.c                         |    2 +-
 kernel/x86_64/KERNEL                          |    8 +
 kernel/x86_64/bf16_common_macros.h            |  795 +++++
 kernel/x86_64/sbgemv_n.c                      |  137 +
 kernel/x86_64/sbgemv_n_microk_cooperlake.c    |   76 +
 .../sbgemv_n_microk_cooperlake_template.c     |  234 ++
 kernel/x86_64/sbgemv_t.c                      |  142 +
 kernel/x86_64/sbgemv_t_microk_cooperlake.c    |  202 ++
 .../sbgemv_t_microk_cooperlake_template.c     | 3082 +++++++++++++++++
 24 files changed, 5111 insertions(+), 16 deletions(-)
 create mode 100644 driver/level2/sbgemv_thread.c
 create mode 100644 interface/sbgemv.c
 create mode 100644 kernel/x86_64/bf16_common_macros.h
 create mode 100644 kernel/x86_64/sbgemv_n.c
 create mode 100644 kernel/x86_64/sbgemv_n_microk_cooperlake.c
 create mode 100644 kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
 create mode 100644 kernel/x86_64/sbgemv_t.c
 create mode 100644 kernel/x86_64/sbgemv_t_microk_cooperlake.c
 create mode 100644 kernel/x86_64/sbgemv_t_microk_cooperlake_template.c

diff --git a/cblas.h b/cblas.h
index bf310bed2..da00d46d6 100644
--- a/cblas.h
+++ b/cblas.h
@@ -393,6 +393,7 @@ void   cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
 void   cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
 /* dot production of BFLOAT16 input arrays, and output as float */
 float  cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
+void   cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
 
 #ifdef __cplusplus
 }
diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake
index 7d7f5ffda..0c102bae5 100644
--- a/cmake/kernel.cmake
+++ b/cmake/kernel.cmake
@@ -184,8 +184,8 @@ macro(SetDefaultL2)
   set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
   set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
 if (BUILD_BFLOAT16)
-  set(SBGEMVNKERNEL ../arm/gemv_n.c)
-  set(SBGEMVTKERNEL ../arm/gemv_t.c)
+  set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
+  set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
   set(SHGERKERNEL ../generic/ger.c)
 endif ()
 endmacro ()
diff --git a/common_interface.h b/common_interface.h
index 032877fe1..b9ebb2772 100644
--- a/common_interface.h
+++ b/common_interface.h
@@ -250,6 +250,8 @@ void BLASFUNC(xgeru)(blasint *,    blasint *, xdouble *, xdouble *, blasint *,
 void BLASFUNC(xgerc)(blasint *,    blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, blasint *);
 
+void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float  *, bfloat16 *, blasint *,
+            bfloat16  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(sgemv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
diff --git a/common_level2.h b/common_level2.h
index 640d4a073..9a5ebb4d9 100644
--- a/common_level2.h
+++ b/common_level2.h
@@ -44,6 +44,10 @@
 extern "C" {
 #endif
 
+int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
+int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
+int sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int);
+int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int);
 int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
 int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
 int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *);
diff --git a/common_macro.h b/common_macro.h
index 54deed57c..c6ea1bfd9 100644
--- a/common_macro.h
+++ b/common_macro.h
@@ -646,10 +646,12 @@
 
 #elif defined(BFLOAT16)
 
-#define  D_TO_BF16_K    SBDTOBF16_K
-#define  D_BF16_TO_K    DBF16TOD_K
-#define  S_TO_BF16_K    SBSTOBF16_K
-#define  S_BF16_TO_K    SBF16TOS_K
+#define D_TO_BF16_K     SBDTOBF16_K
+#define D_BF16_TO_K     DBF16TOD_K
+#define S_TO_BF16_K     SBSTOBF16_K
+#define S_BF16_TO_K     SBF16TOS_K
+#define SBGEMV_N        SBGEMV_N_K
+#define SBGEMV_T        SBGEMV_T_K
 
 #define	AMAX_K			SAMAX_K
 #define	AMIN_K			SAMIN_K
diff --git a/common_param.h b/common_param.h
index b50e4ff80..3e3ae06f8 100644
--- a/common_param.h
+++ b/common_param.h
@@ -78,8 +78,8 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
   int    (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
   int    (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
 
-  int    (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
-  int    (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
+  int    (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
+  int    (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
   int    (*sbger_k)  (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
 
   int    (*sbsymv_L) (BLASLONG, BLASLONG, float,  float  *, BLASLONG, float  *, BLASLONG, float  *, BLASLONG, float *);
diff --git a/common_sb.h b/common_sb.h
index 66968ab00..9976e812e 100644
--- a/common_sb.h
+++ b/common_sb.h
@@ -8,6 +8,8 @@
 #define SBDTOBF16_K         sbdtobf16_k
 #define SBF16TOS_K          sbf16tos_k
 #define DBF16TOD_K          dbf16tod_k
+#define SBGEMV_N_K          sbgemv_n
+#define SBGEMV_T_K          sbgemv_t
 
 #define	SBGEMM_ONCOPY		sbgemm_oncopy
 #define	SBGEMM_OTCOPY		sbgemm_otcopy
@@ -29,6 +31,8 @@
 #define SBDTOBF16_K         gotoblas -> sbdtobf16_k
 #define SBF16TOS_K          gotoblas -> sbf16tos_k
 #define DBF16TOD_K          gotoblas -> dbf16tod_k
+#define SBGEMV_N_K          gotoblas -> sbgemv_n
+#define SBGEMV_T_K          gotoblas -> sbgemv_t
 
 #define	SBGEMM_ONCOPY		gotoblas -> sbgemm_oncopy
 #define	SBGEMM_OTCOPY		gotoblas -> sbgemm_otcopy
diff --git a/driver/level2/Makefile b/driver/level2/Makefile
index 7212d6662..caecf4f97 100644
--- a/driver/level2/Makefile
+++ b/driver/level2/Makefile
@@ -413,7 +413,13 @@ XBLASOBJS   += \
 	xtbmv_thread_RUU.$(SUFFIX)	xtbmv_thread_RUN.$(SUFFIX) \
 	xtbmv_thread_RLU.$(SUFFIX)	xtbmv_thread_RLN.$(SUFFIX) \
 	xtbmv_thread_CUU.$(SUFFIX)	xtbmv_thread_CUN.$(SUFFIX) \
-	xtbmv_thread_CLU.$(SUFFIX)	xtbmv_thread_CLN.$(SUFFIX) \
+	xtbmv_thread_CLU.$(SUFFIX)	xtbmv_thread_CLN.$(SUFFIX)
+
+ifeq ($(BUILD_BFLOAT16),1)
+SBBLASOBJS     += \
+        sbgemv_thread_n.$(SUFFIX) \
+        sbgemv_thread_t.$(SUFFIX)
+endif
 
 endif
 
@@ -3693,4 +3699,12 @@ xtrsv_CUU.$(SUFFIX)  xtrsv_CUU.$(PSUFFIX)  : ztrsv_L.c ../../param.h
 xtrsv_CUN.$(SUFFIX)  xtrsv_CUN.$(PSUFFIX)  : ztrsv_L.c ../../param.h
 	$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
 
+ifeq ($(BUILD_BFLOAT16),1)
+sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE  -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
+sbgemv_thread_t.$(SUFFIX) sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE  -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
+endif
+
+
 include ../../Makefile.tail
diff --git a/driver/level2/sbgemv_thread.c b/driver/level2/sbgemv_thread.c
new file mode 100644
index 000000000..534c60f95
--- /dev/null
+++ b/driver/level2/sbgemv_thread.c
@@ -0,0 +1,149 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
+#ifndef TRANSA
+#define SBGEMV	SBGEMV_N
+#else
+#define SBGEMV	SBGEMV_T
+#endif
+
+/*
+ * Worker run by each thread: applies SBGEMV to its own slice of A and y.
+ * The slice bounds come from range_m (build without TRANSA) or range_n
+ * (build with TRANSA); the dummy arguments are not used.
+ */
+static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){
+
+    bfloat16 *mat  = (bfloat16 *)args->a;
+    bfloat16 *vec  = (bfloat16 *)args->b;
+    float    *out  = (float *)args->c;
+    BLASLONG ld    = args->lda;
+    BLASLONG inc_x = args->ldb;    /* the vector increments are carried in ldb / ldc */
+    BLASLONG inc_y = args->ldc;
+    BLASLONG rows, cols, first;
+
+#ifndef TRANSA
+    /* N: this call handles rows [range_m[0], range_m[1]) and all args->n columns. */
+    first = range_m[0];
+    rows  = range_m[1] - first;
+    cols  = args->n;
+    mat  += first;
+#else
+    /* T: this call handles columns [range_n[0], range_n[1]) and all args->m rows. */
+    first = range_n[0];
+    rows  = args->m;
+    cols  = range_n[1] - first;
+    mat  += first * ld;
+#endif
+    /* In both builds the slice's output starts `first` strided elements into y. */
+    out += first * inc_y;
+
+    SBGEMV(rows, cols, *((FLOAT *)(args->alpha)), mat, ld, vec, inc_x, *((FLOAT *)(args->beta)), out, inc_y);
+
+    return 0;
+}
+
+/*
+ * Threaded SBGEMV driver. The dimension being split (m when TRANSA is not
+ * defined, n when it is) is cut into `threads` contiguous ranges; each range
+ * is queued as one sbgemv_kernel call and the queue is run by exec_blas.
+ * Every range is (width / threads) wide except the last, which also takes
+ * the remainder.
+ */
+int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads)
+{
+    blas_arg_t args;
+    blas_queue_t queue[MAX_CPU_NUMBER];
+    BLASLONG range[MAX_CPU_NUMBER + 1];
+    int t;
+
+#ifndef TRANSA
+    BLASLONG remaining = m;
+#else
+    BLASLONG remaining = n;
+#endif
+    BLASLONG chunk = remaining/threads;
+
+    /* One argument block shared by all workers; alpha and beta go by address. */
+    args.m     = m;
+    args.n     = n;
+    args.a     = (void *)a;
+    args.b     = (void *)x;
+    args.c     = (void *)y;
+    args.lda   = lda;
+    args.ldb   = incx;
+    args.ldc   = incy;
+    args.alpha = (void *)&alpha;
+    args.beta  = (void *)&beta;
+
+    /* Worker t covers [range[t], range[t + 1]). */
+    range[0] = 0;
+
+    for (t = 0; t < threads; t++) {
+        /* `remaining` shrinks by one chunk per pass, so the last worker gets the rest. */
+        range[t + 1] = range[t] + ((t != threads - 1) ? chunk : remaining);
+        remaining   -= chunk;
+
+        queue[t].mode    = BLAS_BFLOAT16  | BLAS_REAL;
+        queue[t].routine = sbgemv_kernel;
+        queue[t].args    = &args;
+        queue[t].sa      = NULL;
+        queue[t].sb      = NULL;
+        queue[t].next    = &queue[t + 1];
+#ifndef TRANSA
+        queue[t].range_m = &range[t];
+        queue[t].range_n = NULL;
+#else
+        queue[t].range_m = NULL;
+        queue[t].range_n = &range[t];
+#endif
+    }
+
+    /* t is now the number of queued entries. */
+    if (t) {
+        /* Terminate the chain before handing it to exec_blas. */
+        queue[t - 1].next = NULL;
+        exec_blas(t, queue);
+    }
+
+    return 0;
+}
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index d546553c1..a8b3e9a4b 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -352,7 +352,6 @@ fprintf(stderr,"UNHANDLED COMPLEX\n");
           /* Other types in future */
 	  }
       }
-if (!sb) fprintf(stderr,"SB not declared!!!\n");
       queue->sb=sb;
     }
   }
diff --git a/exports/gensymbol b/exports/gensymbol
index 22e470da5..857a17a9e 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -51,7 +51,7 @@
     zgeadd, dzsum);
 
 @blasobjs = (lsame, xerbla);
-@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
+@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
 @cblasobjsc = (
     cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
     cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
@@ -94,7 +94,7 @@
 
 @cblasobjs = (  cblas_xerbla );
 
-@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
+@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
 
 @exblasobjs = (
     qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
diff --git a/interface/Makefile b/interface/Makefile
index 7b60111f9..7b0bf1792 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -48,6 +48,7 @@ SBLAS3OBJS    = \
 
 ifeq ($(BUILD_BFLOAT16),1)
 SBBLAS1OBJS    = sbdot.$(SUFFIX)
+SBBLAS2OBJS    = sbgemv.$(SUFFIX)
 SBBLAS3OBJS    = sbgemm.$(SUFFIX)
 SBEXTOBJS      = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
 endif
@@ -284,6 +285,7 @@ CSBLAS3OBJS   = \
 
 ifeq ($(BUILD_BFLOAT16),1)
 CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX)
+CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX)
 CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX)
 CSBEXTOBJS   = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
 endif
@@ -382,6 +384,7 @@ SBLAS1OBJS   += $(CSBLAS1OBJS)
 SBLAS2OBJS   += $(CSBLAS2OBJS)
 SBLAS3OBJS   += $(CSBLAS3OBJS)
 SBBLAS1OBJS  += $(CSBBLAS1OBJS)
+SBBLAS2OBJS  += $(CSBBLAS2OBJS)
 SBBLAS3OBJS  += $(CSBBLAS3OBJS)
 DBLAS1OBJS   += $(CDBLAS1OBJS)
 DBLAS2OBJS   += $(CDBLAS2OBJS)
@@ -399,7 +402,7 @@ CBAUXOBJS += $(CXERBLAOBJ)
 endif
 
 SBLASOBJS    = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
-SBBLASOBJS   = $(SBBLAS1OBJS) $(SBBLAS3OBJS)
+SBBLASOBJS   = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS)
 DBLASOBJS    = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
 QBLASOBJS    = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
 CBLASOBJS    = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
@@ -538,7 +541,7 @@ clean ::
 level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS)
 	$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
 
-level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
+level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
 	$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
 
 level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) 
@@ -929,6 +932,11 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c
 xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c
 	$(CC) -c $(CFLAGS) -DCONJ $< -o $(@F)
 
+ifeq ($(BUILD_BFLOAT16),1)  # SBGEMV interface object, built from sbgemv.c without -DCBLAS
+sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c
+	$(CC) $(CFLAGS) -c $< -o $(@F)
+endif
+
 ifndef USE_NETLIB_GEMV
 sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c
 	$(CC) -c $(CFLAGS) -o $(@F) $<
@@ -1656,6 +1664,11 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c
 cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
 
+ifeq ($(BUILD_BFLOAT16),1)  # CBLAS SBGEMV interface object: the same sbgemv.c, built with -DCBLAS
+cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
+	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+endif
+
 cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c
 	$(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $<
 
diff --git a/interface/gemv.c b/interface/gemv.c
index c9d52cd69..d5d739fb1 100644
--- a/interface/gemv.c
+++ b/interface/gemv.c
@@ -191,7 +191,6 @@ void CNAME(enum CBLAS_ORDER order,
   }
 
 #endif
-  //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta);
   if ((m==0) || (n==0)) return;
 
   lenx = n;
diff --git a/interface/sbgemv.c b/interface/sbgemv.c
new file mode 100644
index 000000000..89debe82d
--- /dev/null
+++ b/interface/sbgemv.c
@@ -0,0 +1,210 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "l1param.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#define ERROR_NAME "SBGEMV "
+
+#ifdef SMP  /* threaded drivers; the interface indexes this table with trans (entry 0 = _n, entry 1 = _t) */
+static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = {
+    sbgemv_thread_n, sbgemv_thread_t,
+};
+#endif
+
+#ifndef CBLAS
+/* Fortran-style entry point: every argument arrives by reference and is copied into a local first. */
+void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY)
+{
+    char trans = *TRANS;
+    blasint m = *M;
+    blasint n = *N;
+    blasint lda = *LDA;
+    blasint incx = *INCX;
+    blasint incy = *INCY;
+    float alpha = *ALPHA;
+    float beta  = *BETA;
+#ifdef SMP
+    int nthreads;
+#endif
+    /* Single-threaded kernels, indexed by the decoded trans value (0 or 1). */
+    int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = {
+        SBGEMV_N, SBGEMV_T,
+    };
+
+    blasint info;
+    blasint lenx, leny;
+    blasint i;
+
+    PRINT_DEBUG_NAME;
+
+    TOUPPER(trans);
+
+    info = 0;
+
+    i = -1;
+    /* 'R' selects the same kernel index as 'N', and 'C' the same as 'T'; anything else leaves i at -1. */
+    if (trans == 'N') {i = 0;}
+    if (trans == 'T') {i = 1;}
+    if (trans == 'R') {i = 0;}
+    if (trans == 'C') {i = 1;}
+    /* Later checks overwrite earlier ones, so the lowest-numbered bad argument is the one reported. */
+    if (incy == 0)       {info = 11;}
+    if (incx == 0)       {info = 8;}
+    if (lda < MAX(1, m)) {info = 6;}
+    if (n < 0)           {info = 3;}
+    if (m < 0)           {info = 2;}
+    if (i < 0)           {info = 1;}
+
+    trans = i;  /* from here on trans holds the kernel index, not a character */
+
+    if (info != 0) {
+        BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+        return;
+    }
+
+#else
+/* CBLAS entry point: arguments arrive by value, with an explicit storage order. */
+void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy)
+{
+    blasint lenx,  leny;
+    int     trans;
+    blasint info,  t;
+#ifdef SMP
+    int     nthreads;
+#endif
+    /* Single-threaded kernels, indexed by trans (0 or 1). */
+    int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG,  bfloat16 * , BLASLONG, float, float *, BLASLONG) = {
+        SBGEMV_N, SBGEMV_T,
+    };
+
+    PRINT_DEBUG_CNAME;
+
+    trans = -1;
+    info  =  0;
+
+    if (order == CblasColMajor) {   // Column Major
+        if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) {
+            trans = 0;
+        } else if (TransA == CblasTrans || TransA == CblasConjTrans) {
+            trans = 1;
+        }
+    } else {                        // Row Major
+        if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) {
+            trans = 1;
+        } else if (TransA == CblasTrans || TransA == CblasConjTrans) {
+            trans = 0;
+        }
+        /* Any order other than column-major takes this branch: trans is inverted and m / n are exchanged. */
+        t = n;
+        n = m;
+        m = t;
+    }
+
+    info = -1;  /* -1 means "no error" here; xerbla is called below when info >= 0 */
+    /* Later checks overwrite earlier ones, so the lowest-numbered bad argument is the one reported. */
+    if (incy == 0)       {info = 11;}
+    if (incx == 0)       {info = 8;}
+    if (lda < MAX(1, m)) {info = 6;}
+    if (n < 0)           {info = 3;}
+    if (m < 0)           {info = 2;}
+    if (trans < 0)       {info = 1;}
+
+    if (info >= 0) {
+        BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+        return;
+    }
+
+#endif
+    /* Shared body of both entry points; nothing is done when either dimension is zero. */
+    if ((m==0) || (n==0)) return;
+    /* x is read over lenx elements and y written over leny; the two lengths swap for trans != 0. */
+    if (trans) {
+        lenx = m;
+        leny = n;
+    } else {
+        lenx = n;
+        leny = m;
+    }
+    /* alpha == 0: only y is touched, scaled by beta through SCAL_K (skipped when beta == 1). */
+    if (alpha == ZERO) {
+        if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
+        return;
+    }
+
+    IDEBUG_START;
+    FUNCTION_PROFILE_START();
+    /* For a negative increment, advance the pointer by (len - 1) * |inc| elements before calling the kernel. */
+    if (incx < 0) {x -= (lenx - 1) * incx;}
+    if (incy < 0) {y -= (leny - 1) * incy;}
+
+#ifdef SMP
+    int thread_thres_row = 20480;  /* n (trans != 0) or m (trans == 0) must exceed this before threads are used */
+    if (trans) {
+        if (n <= thread_thres_row) {
+            nthreads = 1;
+        } else {
+            nthreads = num_cpu_avail(1);
+        }
+    } else {
+        if (m <= thread_thres_row) {
+            nthreads = 1;
+        } else {
+            nthreads = num_cpu_avail(1);
+        }
+    }
+
+    /* One thread (or a build without SMP): call the kernel directly; otherwise go through the threaded driver. */
+    if (nthreads == 1) {
+#endif
+        (sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy);
+#ifdef SMP
+    } else {
+        (sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads);
+    }
+#endif
+
+    FUNCTION_PROFILE_END(1, m * n + m + n,  2 * m * n);
+    IDEBUG_END;
+
+    return;
+}
diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2
index 79399c342..888a9b959 100644
--- a/kernel/Makefile.L2
+++ b/kernel/Makefile.L2
@@ -48,6 +48,16 @@ ifndef XGEMVTKERNEL
 XGEMVTKERNEL = zgemv_t.S
 endif
 
+ifeq ($(BUILD_BFLOAT16),1)  # fallback SBGEMV kernel sources, applied only when the variables are not already set
+ifndef SBGEMVNKERNEL  # NOTE(review): the fallback points into ../x86_64 for every target - confirm it builds elsewhere
+SBGEMVNKERNEL = ../x86_64/sbgemv_n.c
+endif
+
+ifndef SBGEMVTKERNEL
+SBGEMVTKERNEL = ../x86_64/sbgemv_t.c
+endif
+endif
+
 ### GER ###
 
 ifndef SGERKERNEL
@@ -234,6 +244,12 @@ XBLASOBJS	+= \
 	xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \
 	xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX)
 
+ifeq ($(BUILD_BFLOAT16),1)  # SBGEMV kernel objects (N and T variants) are added only for bfloat16 builds
+SBBLASOBJS     += \
+        sbgemv_n$(TSUFFIX).$(SUFFIX) \
+        sbgemv_t$(TSUFFIX).$(SUFFIX)
+endif
+
 ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" ""
 $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX)  $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX)  : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
 	$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX  -UTRANS $< -o $@
@@ -483,4 +499,10 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX)  $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX)  : $(KER
 $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX)  $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX)  : $(KERNELDIR)/$(XHEMV_M_KERNEL)  ../symcopy.h
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
 
+ifeq ($(BUILD_BFLOAT16),1)  # SBGEMV kernels; the profiled (.$(PSUFFIX)) targets carry $(TSUFFIX) like the other rules here
+$(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVNKERNEL)
+	$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
+$(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL)
+	$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
+endif
 
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index 849a4194a..d0317a745 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -69,7 +69,7 @@ gotoblas_t TABLE_NAME = {
   snrm2_kTS,  sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS,
   dsdot_kTS,
   srot_kTS,   saxpy_kTS,  sscal_kTS, sswap_kTS,
-  sgemv_nTS,  sgemv_tTS, sger_kTS,
+  sbgemv_nTS, sbgemv_tTS, sger_kTS,
   ssymv_LTS, ssymv_UTS,
 
   sbgemm_kernelTS, sbgemm_betaTS,
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index 855e1ff8c..b92f480e9 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -384,6 +384,14 @@ endif
 
 GEMVDEP = ../l2param.h
 
+ifndef SBGEMVNKERNEL  # default source for the non-transposed SBGEMV kernel unless already chosen
+SBGEMVNKERNEL = sbgemv_n.c
+endif
+
+ifndef SBGEMVTKERNEL  # default source for the transposed SBGEMV kernel unless already chosen
+SBGEMVTKERNEL = sbgemv_t.c
+endif
+
 ifndef SGEMVNKERNEL
 SGEMVNKERNEL = sgemv_n.c
 endif
diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h
new file mode 100644
index 000000000..1014ecc4d
--- /dev/null
+++ b/kernel/x86_64/bf16_common_macros.h
@@ -0,0 +1,795 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#ifndef __BF16_COMMON_MACROS
+#define __BF16_COMMON_MACROS
+
+#include <immintrin.h>
+/* Takes the low 256 bits of reg512_0 and reg512_1 (float vectors) into reg256_0 and reg256_1. */
+#define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512)   \
+    reg256##_0 = _mm512_castps512_ps256(reg512##_0);  \
+    reg256##_1 = _mm512_castps512_ps256(reg512##_1);
+
+/* Unaligned 512-bit loads of rows idx_m..idx_m+7 of a (row stride lda, first column idx_n) into regArray_0..regArray_7. */
+#define BF16_MATRIX_LOAD_8x32(regArray, a, lda, idx_m, idx_n)      \
+    regArray##_0 = _mm512_loadu_si512(&a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm512_loadu_si512(&a[(idx_m+1)*lda + idx_n]);  \
+    regArray##_2 = _mm512_loadu_si512(&a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_3 = _mm512_loadu_si512(&a[(idx_m+3)*lda + idx_n]);  \
+    regArray##_4 = _mm512_loadu_si512(&a[(idx_m+4)*lda + idx_n]);  \
+    regArray##_5 = _mm512_loadu_si512(&a[(idx_m+5)*lda + idx_n]);  \
+    regArray##_6 = _mm512_loadu_si512(&a[(idx_m+6)*lda + idx_n]);  \
+    regArray##_7 = _mm512_loadu_si512(&a[(idx_m+7)*lda + idx_n]);
+
+/* Unaligned 256-bit loads of rows idx_m..idx_m+7 of a into regArray_0..regArray_7; the cast supplies the __m256i pointer type the intrinsic is declared with. */
+#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n)                        \
+    regArray##_0 = _mm256_loadu_si256((const __m256i *)&a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm256_loadu_si256((const __m256i *)&a[(idx_m+1)*lda + idx_n]);  \
+    regArray##_2 = _mm256_loadu_si256((const __m256i *)&a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_3 = _mm256_loadu_si256((const __m256i *)&a[(idx_m+3)*lda + idx_n]);  \
+    regArray##_4 = _mm256_loadu_si256((const __m256i *)&a[(idx_m+4)*lda + idx_n]);  \
+    regArray##_5 = _mm256_loadu_si256((const __m256i *)&a[(idx_m+5)*lda + idx_n]);  \
+    regArray##_6 = _mm256_loadu_si256((const __m256i *)&a[(idx_m+6)*lda + idx_n]);  \
+    regArray##_7 = _mm256_loadu_si256((const __m256i *)&a[(idx_m+7)*lda + idx_n]);
+
+/* Unaligned 128-bit loads of rows idx_m..idx_m+7 of a into regArray_0..regArray_7; the cast supplies the __m128i pointer type the intrinsic is declared with. */
+#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n)                      \
+    regArray##_0 = _mm_loadu_si128((const __m128i *)&a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm_loadu_si128((const __m128i *)&a[(idx_m+1)*lda + idx_n]);  \
+    regArray##_2 = _mm_loadu_si128((const __m128i *)&a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_3 = _mm_loadu_si128((const __m128i *)&a[(idx_m+3)*lda + idx_n]);  \
+    regArray##_4 = _mm_loadu_si128((const __m128i *)&a[(idx_m+4)*lda + idx_n]);  \
+    regArray##_5 = _mm_loadu_si128((const __m128i *)&a[(idx_m+5)*lda + idx_n]);  \
+    regArray##_6 = _mm_loadu_si128((const __m128i *)&a[(idx_m+6)*lda + idx_n]);  \
+    regArray##_7 = _mm_loadu_si128((const __m128i *)&a[(idx_m+7)*lda + idx_n]);
+
+/* Unaligned 512-bit load of row idx_m of a (row stride lda, first column idx_n); idx_m is parenthesised so an expression argument scales by lda as a whole. */
+#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n)       \
+    regArray = _mm512_loadu_si512(&a[(idx_m)*lda + idx_n]);
+
+/* Masked 512-bit loads of rows idx_m..idx_m+7 of a into regArray_0..regArray_7; 16-bit lanes whose mask bit is clear are zeroed. */
+#define BF16_MATRIX_MASKZ_LOAD_8x32(regArray, a, lda, idx_m, idx_n, mask)      \
+    regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]);  \
+    regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);  \
+    regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]);  \
+    regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]);  \
+    regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]);  \
+    regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
+
+/* Masked 256-bit loads of rows idx_m..idx_m+7 of a into regArray_0..regArray_7; 16-bit lanes whose mask bit is clear are zeroed. */
+#define BF16_MATRIX_MASKZ_LOAD_8x16(regArray, a, lda, idx_m, idx_n, mask)      \
+    regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]);  \
+    regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);  \
+    regArray##_4 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]);  \
+    regArray##_5 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]);  \
+    regArray##_6 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]);  \
+    regArray##_7 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
+
+/* Masked 128-bit loads of rows idx_m..idx_m+7 of a into regArray_0..regArray_7; 16-bit lanes whose mask bit is clear are zeroed. */
+#define BF16_MATRIX_MASKZ_LOAD_8x8(regArray, a, lda, idx_m, idx_n, mask)    \
+    regArray##_0 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]);  \
+    regArray##_2 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_3 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);  \
+    regArray##_4 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]);  \
+    regArray##_5 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]);  \
+    regArray##_6 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]);  \
+    regArray##_7 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
+
+/* Masked 512-bit loads of rows idx_m..idx_m+3 of a into regArray_0..regArray_3; 16-bit lanes whose mask bit is clear are zeroed. */
+#define BF16_MATRIX_MASKZ_LOAD_4x32(regArray, a, lda, idx_m, idx_n, mask)      \
+    regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]);  \
+    regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);
+
+/* Masked 256-bit loads of rows idx_m..idx_m+3 of a into regArray_0..regArray_3; 16-bit lanes whose mask bit is clear are zeroed. */
+#define BF16_MATRIX_MASKZ_LOAD_4x16(regArray, a, lda, idx_m, idx_n, mask)      \
+    regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]);  \
+    regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);
+
+/* Masked 512-bit loads of every second row (idx_m, idx_m+2, ..., idx_m+14) of a into regArray_0..regArray_7. */
+#define BF16_MATRIX_MASKZ_LOAD_8x32_2(regArray, a, lda, idx_m, idx_n, mask)    \
+    regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]);  \
+    regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]);  \
+    regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+8)*lda + idx_n]);  \
+    regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+10)*lda + idx_n]);  \
+    regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+12)*lda + idx_n]);  \
+    regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+14)*lda + idx_n]);
+
+/* Masked 512-bit loads of every second row (idx_m, idx_m+2, idx_m+4, idx_m+6) of a into regArray_0..regArray_3. */
+#define BF16_MATRIX_MASKZ_LOAD_4x32_2(regArray, a, lda, idx_m, idx_n, mask)    \
+    regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]);  \
+    regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]);  \
+    regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]);  \
+    regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]);
+/* Masked 512-bit load of row idx_m of a; idx_m is parenthesised so an expression argument scales by lda as a whole. */
+#define BF16_MATRIX_MASKZ_LOAD_1x32(regArray, a, lda, idx_m, idx_n, mask)      \
+    regArray = _mm512_maskz_loadu_epi16(mask, &a[(idx_m)*lda + idx_n]);
+/* Unaligned 512-bit load from x + idx_n into reg. */
+#define BF16_VECTOR_LOAD_1x32(reg, x, idx_n)     \
+    reg = _mm512_loadu_si512(x + idx_n);
+
+/* Unaligned 256-bit load from x + idx_n into reg; the cast supplies the __m256i pointer type the intrinsic is declared with. */
+#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n)     \
+    reg = _mm256_loadu_si256((const __m256i *)(x + idx_n));
+
+/* Unaligned 128-bit load from x + idx_n into reg; the cast supplies the __m128i pointer type the intrinsic is declared with. */
+#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n)      \
+    reg = _mm_loadu_si128((const __m128i *)(x + idx_n));
+
+/* Masked 512-bit load from x + idx_n into reg; 16-bit lanes whose mask bit is clear are zeroed. */
+#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask)     \
+    reg = _mm512_maskz_loadu_epi16(mask, x + idx_n);
+
+/* Masked 256-bit load from x + idx_n into reg; 16-bit lanes whose mask bit is clear are zeroed. */
+#define BF16_VECTOR_MASKZ_LOAD_1x16(reg, x, idx_n, mask)     \
+    reg = _mm256_maskz_loadu_epi16(mask, x + idx_n);
+
+/* Masked 128-bit load from x + idx_n into reg; 16-bit lanes whose mask bit is clear are zeroed. */
+#define BF16_VECTOR_MASKZ_LOAD_1x8(reg, x, idx_n, mask)      \
+    reg = _mm_maskz_loadu_epi16(mask, x + idx_n);
+
+
+/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row
+    Input  - register array of 8 rows of row-major matrix
+    Output - the output of Step 2
+
+    Step 1: 2-element interleave for matrix
+    |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27
+    |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27
+    |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11|e16|e17|f16|f17|e18|e19|f18|f19|e24|e25|f24|f25|e26|e27|f26|f27
+    |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11|g16|g17|h16|h17|g18|g19|h18|h19|g24|g25|h24|h25|g26|g27|h26|h27
+    |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31
+    |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31
+    |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15|e20|e21|f20|f21|e22|e23|f22|f23|e28|e29|f28|f29|e30|e31|f30|f31
+    |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15|g20|g21|h20|h21|g22|g23|h22|h23|g28|g29|h28|h29|g30|g31|h30|h31
+
+    Step 2: 4-element interleave for matrix
+    |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25
+    |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27
+    |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 |e16|e17|f16|f17|g16|g17|h16|h17|e24|e25|f24|f25|g24|g25|h24|h25
+    |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11|e18|e19|f18|f19|g18|g19|h18|h19|e26|e27|f26|f27|g26|g27|h26|h27
+    |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29
+    |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31
+    |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13|e20|e21|f20|f21|g20|g21|h20|h21|e28|e29|f28|f29|g28|g29|h28|h29
+    |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15|e22|e23|f22|f23|g22|g23|h22|h23|e30|e31|f30|f31|g30|g31|h30|h31
+*/
+#define BF16_INTERLEAVE_8x32(regArray)  /* Step 1: 32-bit unpack of row pairs into regArray_8..15 */  \
+    regArray##_8  = _mm512_unpacklo_epi32(regArray##_0, regArray##_1);  \
+    regArray##_9  = _mm512_unpacklo_epi32(regArray##_2, regArray##_3);  \
+    regArray##_10 = _mm512_unpacklo_epi32(regArray##_4, regArray##_5);  \
+    regArray##_11 = _mm512_unpacklo_epi32(regArray##_6, regArray##_7);  \
+    regArray##_12 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1);  \
+    regArray##_13 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3);  \
+    regArray##_14 = _mm512_unpackhi_epi32(regArray##_4, regArray##_5);  \
+    regArray##_15 = _mm512_unpackhi_epi32(regArray##_6, regArray##_7);  \
+    /* Step 2: 64-bit unpack of the step-1 results back into regArray_0..7 */ \
+    regArray##_0 = _mm512_unpacklo_epi64(regArray##_8,  regArray##_9);  \
+    regArray##_1 = _mm512_unpackhi_epi64(regArray##_8,  regArray##_9);  \
+    regArray##_2 = _mm512_unpacklo_epi64(regArray##_10, regArray##_11); \
+    regArray##_3 = _mm512_unpackhi_epi64(regArray##_10, regArray##_11); \
+    regArray##_4 = _mm512_unpacklo_epi64(regArray##_12, regArray##_13); \
+    regArray##_5 = _mm512_unpackhi_epi64(regArray##_12, regArray##_13); \
+    regArray##_6 = _mm512_unpacklo_epi64(regArray##_14, regArray##_15); \
+    regArray##_7 = _mm512_unpackhi_epi64(regArray##_14, regArray##_15);
+
+
+/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row
+    Input  - register array of 8 rows of row-major matrix
+    Output - the output of Step 2
+
+    Step 1: 2-element interleave for matrix
+    |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11
+    |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11
+    |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11
+    |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11
+    |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15
+    |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15
+    |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15
+    |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15
+
+    Step 2: 4-element interleave for matrix
+    |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9
+    |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11
+    |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9
+    |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11
+    |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13
+    |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15
+    |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13
+    |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15
+*/
+#define BF16_INTERLEAVE_8x16(regArray)                                  \
+    regArray##_8  = _mm256_unpacklo_epi32(regArray##_0, regArray##_1);  \
+    regArray##_9  = _mm256_unpacklo_epi32(regArray##_2, regArray##_3);  \
+    regArray##_10 = _mm256_unpacklo_epi32(regArray##_4, regArray##_5);  \
+    regArray##_11 = _mm256_unpacklo_epi32(regArray##_6, regArray##_7);  \
+    regArray##_12 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1);  \
+    regArray##_13 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3);  \
+    regArray##_14 = _mm256_unpackhi_epi32(regArray##_4, regArray##_5);  \
+    regArray##_15 = _mm256_unpackhi_epi32(regArray##_6, regArray##_7);  \
+                                                                        \
+    regArray##_0  = _mm256_unpacklo_epi64(regArray##_8,  regArray##_9);    \
+    regArray##_1  = _mm256_unpackhi_epi64(regArray##_8,  regArray##_9);    \
+    regArray##_2  = _mm256_unpacklo_epi64(regArray##_10, regArray##_11);   \
+    regArray##_3  = _mm256_unpackhi_epi64(regArray##_10, regArray##_11);   \
+    regArray##_4  = _mm256_unpacklo_epi64(regArray##_12, regArray##_13);   \
+    regArray##_5  = _mm256_unpackhi_epi64(regArray##_12, regArray##_13);   \
+    regArray##_6  = _mm256_unpacklo_epi64(regArray##_14, regArray##_15);   \
+    regArray##_7  = _mm256_unpackhi_epi64(regArray##_14, regArray##_15);
+
+/* 2-step interleave for matrix against 4 rows with 32 BF16 elements per row
+    Input  - register array of 4 rows of row-major matrix
+    Output - the output of Step 2
+
+    Step 1: 2-element interleave for matrix
+    |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27
+    |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27
+    |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31
+    |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31
+
+    Step 2: 4-element interleave for matrix
+    |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25
+    |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27
+    |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29
+    |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31
+*/
+#define BF16_INTERLEAVE_4x32(regArray)                                 \
+    regArray##_4 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1);  \
+    regArray##_5 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3);  \
+    regArray##_6 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1);  \
+    regArray##_7 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3);  \
+                                                                       \
+    regArray##_0 = _mm512_unpacklo_epi64(regArray##_4, regArray##_5);  \
+    regArray##_1 = _mm512_unpackhi_epi64(regArray##_4, regArray##_5);  \
+    regArray##_2 = _mm512_unpacklo_epi64(regArray##_6, regArray##_7);  \
+    regArray##_3 = _mm512_unpackhi_epi64(regArray##_6, regArray##_7);
+
+
+/* 2-step interleave for matrix against 4 rows with 16 BF16 elements per row
+    Input  - register array of 4 rows of row-major matrix
+    Output - the output of Step 2
+
+    Step 1: 2-element interleave for matrix
+    |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11
+    |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11
+    |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15
+    |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15
+
+    Step 2: 4-element interleave for matrix
+    |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9
+    |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11
+    |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13
+    |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15
+*/
+#define BF16_INTERLEAVE_4x16(regArray)                                 \
+    regArray##_4 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1);  \
+    regArray##_5 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3);  \
+    regArray##_6 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1);  \
+    regArray##_7 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3);  \
+                                                                       \
+    regArray##_0 = _mm256_unpacklo_epi64(regArray##_4, regArray##_5);  \
+    regArray##_1 = _mm256_unpackhi_epi64(regArray##_4, regArray##_5);  \
+    regArray##_2 = _mm256_unpacklo_epi64(regArray##_6, regArray##_7);  \
+    regArray##_3 = _mm256_unpackhi_epi64(regArray##_6, regArray##_7);
+
+
+/* 2-step interleave for x with 32 BF16 elements
+    Input  - original vector
+    Output - the output of Step 2
+
+    Step 1: 2-element interleave for x:
+    |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11|x16|x17|x16|x17|x18|x19|x18|x19|x24|x25|x24|x25|x26|x27|x26|x27
+    |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15|x20|x21|x20|x21|x22|x23|x22|x23|x28|x29|x28|x29|x30|x31|x30|x31
+ 
+    Step 2: 4-element interleave for x:
+    |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 |x16|x17|x16|x17|x16|x17|x16|x17|x24|x25|x24|x25|x24|x25|x24|x25
+    |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11|x18|x19|x18|x19|x18|x19|x18|x19|x26|x27|x26|x27|x26|x27|x26|x27
+    |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13|x20|x21|x20|x21|x20|x21|x20|x21|x28|x29|x28|x29|x28|x29|x28|x29
+    |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15|x22|x23|x22|x23|x22|x23|x22|x23|x30|x31|x30|x31|x30|x31|x30|x31
+*/
+#define BF16_INTERLEAVE_1x32(regArray)                                 \
+    regArray##_1 = _mm512_unpacklo_epi32(regArray##_0, regArray##_0);  \
+    regArray##_3 = _mm512_unpackhi_epi32(regArray##_0, regArray##_0);  \
+                                                                       \
+    regArray##_0 = _mm512_unpacklo_epi64(regArray##_1, regArray##_1);  \
+    regArray##_1 = _mm512_unpackhi_epi64(regArray##_1, regArray##_1);  \
+    regArray##_2 = _mm512_unpacklo_epi64(regArray##_3, regArray##_3);  \
+    regArray##_3 = _mm512_unpackhi_epi64(regArray##_3, regArray##_3);
+
+
+/* 2-step interleave for x with 16 BF16 elements
+    Input  - original vector
+    Output - the output of Step 2
+
+    Step 1: 2-element interleave for x:
+    |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11
+    |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15
+
+    Step 2: 4-element interleave for x:
+    |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9
+    |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11
+    |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13
+    |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15
+*/
+#define BF16_INTERLEAVE_1x16(regArray)                                 \
+    regArray##_1 = _mm256_unpacklo_epi32(regArray##_0, regArray##_0);  \
+    regArray##_3 = _mm256_unpackhi_epi32(regArray##_0, regArray##_0);  \
+                                                                       \
+    regArray##_0 = _mm256_unpacklo_epi64(regArray##_1, regArray##_1);  \
+    regArray##_1 = _mm256_unpackhi_epi64(regArray##_1, regArray##_1);  \
+    regArray##_2 = _mm256_unpacklo_epi64(regArray##_3, regArray##_3);  \
+    regArray##_3 = _mm256_unpackhi_epi64(regArray##_3, regArray##_3);
+
+/* 1-step interleave to exchange the high 256 bits and low 256 bits of 4 pairs of registers
+   |a0|a1|...|a14|a15|i0|i1|...|i14|i15|
+   |b0|b1|...|b14|b15|j0|j1|...|j14|j15|
+   |c0|c1|...|c14|c15|k0|k1|...|k14|k15|
+   |d0|d1|...|d14|d15|l0|l1|...|l14|l15|
+   |e0|e1|...|e14|e15|m0|m1|...|m14|m15|
+   |f0|f1|...|f14|f15|n0|n1|...|n14|n15|
+   |g0|g1|...|g14|g15|o0|o1|...|o14|o15|
+   |h0|h1|...|h14|h15|p0|p1|...|p14|p15|
+*/
+#define BF16_INTERLEAVE256_8x32(regArray)                                     \
+    regArray##_0 = _mm512_shuffle_i32x4(regArray##_8,  regArray##_12, 0x44);  \
+    regArray##_1 = _mm512_shuffle_i32x4(regArray##_8,  regArray##_12, 0xee);  \
+    regArray##_2 = _mm512_shuffle_i32x4(regArray##_9,  regArray##_13, 0x44);  \
+    regArray##_3 = _mm512_shuffle_i32x4(regArray##_9,  regArray##_13, 0xee);  \
+    regArray##_4 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0x44);  \
+    regArray##_5 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0xee);  \
+    regArray##_6 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0x44);  \
+    regArray##_7 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0xee);
+
+
+/* 1-step interleave to exchange the high 256 bits and low 256 bits of 2 pairs of registers
+   |a0|a1|...|a14|a15|e0|e1|...|e14|e15|
+   |b0|b1|...|b14|b15|f0|f1|...|f14|f15|
+   |c0|c1|...|c14|c15|g0|g1|...|g14|g15|
+   |d0|d1|...|d14|d15|h0|h1|...|h14|h15|
+*/
+#define BF16_INTERLEAVE256_4x32(regArray)                                    \
+    regArray##_0 = _mm512_shuffle_i32x4(regArray##_4,  regArray##_6, 0x44);  \
+    regArray##_1 = _mm512_shuffle_i32x4(regArray##_4,  regArray##_6, 0xee);  \
+    regArray##_2 = _mm512_shuffle_i32x4(regArray##_5,  regArray##_7, 0x44);  \
+    regArray##_3 = _mm512_shuffle_i32x4(regArray##_5,  regArray##_7, 0xee);
+
+
+#define BF16_PERMUTE_8x32(idx, regArray) \
+    regArray##_8  = _mm512_permutexvar_epi16(idx, regArray##_0);  \
+    regArray##_9  = _mm512_permutexvar_epi16(idx, regArray##_1);  \
+    regArray##_10 = _mm512_permutexvar_epi16(idx, regArray##_2);  \
+    regArray##_11 = _mm512_permutexvar_epi16(idx, regArray##_3);  \
+    regArray##_12 = _mm512_permutexvar_epi16(idx, regArray##_4);  \
+    regArray##_13 = _mm512_permutexvar_epi16(idx, regArray##_5);  \
+    regArray##_14 = _mm512_permutexvar_epi16(idx, regArray##_6);  \
+    regArray##_15 = _mm512_permutexvar_epi16(idx, regArray##_7);
+
+
+#define BF16_PERMUTE_8x32_2(idx, regArray) \
+    regArray##_8  = _mm512_permutexvar_epi32(idx, regArray##_0);  \
+    regArray##_9  = _mm512_permutexvar_epi32(idx, regArray##_1);  \
+    regArray##_10 = _mm512_permutexvar_epi32(idx, regArray##_2);  \
+    regArray##_11 = _mm512_permutexvar_epi32(idx, regArray##_3);  \
+    regArray##_12 = _mm512_permutexvar_epi32(idx, regArray##_4);  \
+    regArray##_13 = _mm512_permutexvar_epi32(idx, regArray##_5);  \
+    regArray##_14 = _mm512_permutexvar_epi32(idx, regArray##_6);  \
+    regArray##_15 = _mm512_permutexvar_epi32(idx, regArray##_7);
+
+
+#define BF16_PERMUTE_4x32(idx, regArray) \
+    regArray##_4 = _mm512_permutexvar_epi16(idx, regArray##_0);  \
+    regArray##_5 = _mm512_permutexvar_epi16(idx, regArray##_1);  \
+    regArray##_6 = _mm512_permutexvar_epi16(idx, regArray##_2);  \
+    regArray##_7 = _mm512_permutexvar_epi16(idx, regArray##_3);
+
+
+#define BF16_PERMUTE_4x32_2(idx, regArray) \
+    regArray##_4 = _mm512_permutexvar_epi32(idx, regArray##_0);  \
+    regArray##_5 = _mm512_permutexvar_epi32(idx, regArray##_1);  \
+    regArray##_6 = _mm512_permutexvar_epi32(idx, regArray##_2);  \
+    regArray##_7 = _mm512_permutexvar_epi32(idx, regArray##_3);
+
+
+/* Calculate the dot result for 2-step interleaved matrix and vector
+   (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_2STEP_INTERLEAVED_DOT_8x32(accumArray, matArray, xArray)                                   \
+    accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0);  \
+    accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_2, (__m512bh) xArray##_0);  \
+    accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_1, (__m512bh) xArray##_1);  \
+    accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_1);  \
+    accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_4, (__m512bh) xArray##_2);  \
+    accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_6, (__m512bh) xArray##_2);  \
+    accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_5, (__m512bh) xArray##_3);  \
+    accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_7, (__m512bh) xArray##_3);
+
+
+/* Calculate the dot result for 2-step interleaved matrix and vector
+   (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_2STEP_INTERLEAVED_DOT_8x16(accumArray, matArray, xArray)                                   \
+    accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0);  \
+    accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_2, (__m256bh) xArray##_0);  \
+    accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_1, (__m256bh) xArray##_1);  \
+    accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_1);  \
+    accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_4, (__m256bh) xArray##_2);  \
+    accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_6, (__m256bh) xArray##_2);  \
+    accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_5, (__m256bh) xArray##_3);  \
+    accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_7, (__m256bh) xArray##_3);
+
+/* Calculate the dot result for 2-step interleaved matrix and vector
+   (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_2STEP_INTERLEAVED_DOT_4x32(accumArray, matArray, xArray)                                   \
+    accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0);  \
+    accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray##_1);  \
+    accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_2, (__m512bh) xArray##_2);  \
+    accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_3);
+
+
+/* Calculate the dot result for 2-step interleaved matrix and vector
+   (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_2STEP_INTERLEAVED_DOT_4x16(accumArray, matArray, xArray)                                   \
+    accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0);  \
+    accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray##_1);  \
+    accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_2, (__m256bh) xArray##_2);  \
+    accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_3);
+
+
+/* Calculate the dot result for matrix and vector at 32 elements per row
+   (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_DOT_8x32(accumArray, matArray, xArray)                                                 \
+    accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray);  \
+    accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray);  \
+    accumArray##_2 = _mm512_dpbf16_ps(accumArray##_2, (__m512bh) matArray##_2, (__m512bh) xArray);  \
+    accumArray##_3 = _mm512_dpbf16_ps(accumArray##_3, (__m512bh) matArray##_3, (__m512bh) xArray);  \
+    accumArray##_4 = _mm512_dpbf16_ps(accumArray##_4, (__m512bh) matArray##_4, (__m512bh) xArray);  \
+    accumArray##_5 = _mm512_dpbf16_ps(accumArray##_5, (__m512bh) matArray##_5, (__m512bh) xArray);  \
+    accumArray##_6 = _mm512_dpbf16_ps(accumArray##_6, (__m512bh) matArray##_6, (__m512bh) xArray);  \
+    accumArray##_7 = _mm512_dpbf16_ps(accumArray##_7, (__m512bh) matArray##_7, (__m512bh) xArray);
+
+/* Calculate the dot result for matrix and vector at 32 elements per row
+   (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_DOT_1x32(accumArray, matArray, xArray)                                                 \
+    accumArray = _mm512_dpbf16_ps(accumArray, (__m512bh) matArray, (__m512bh) xArray);
+
+/* Calculate the dot result for matrix and vector at 16 elements per row
+   (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_DOT_8x16(accumArray, matArray, xArray)                                                 \
+    accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray);  \
+    accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray);  \
+    accumArray##_2 = _mm256_dpbf16_ps(accumArray##_2, (__m256bh) matArray##_2, (__m256bh) xArray);  \
+    accumArray##_3 = _mm256_dpbf16_ps(accumArray##_3, (__m256bh) matArray##_3, (__m256bh) xArray);  \
+    accumArray##_4 = _mm256_dpbf16_ps(accumArray##_4, (__m256bh) matArray##_4, (__m256bh) xArray);  \
+    accumArray##_5 = _mm256_dpbf16_ps(accumArray##_5, (__m256bh) matArray##_5, (__m256bh) xArray);  \
+    accumArray##_6 = _mm256_dpbf16_ps(accumArray##_6, (__m256bh) matArray##_6, (__m256bh) xArray);  \
+    accumArray##_7 = _mm256_dpbf16_ps(accumArray##_7, (__m256bh) matArray##_7, (__m256bh) xArray);
+
+
+/* 2-step interleave for matrix against 8 rows with 16 fp32 elements per row
+    Input  - register array of 8 rows of row-major matrix
+    Output - the output of Step 2
+
+    Step 1: 2-element interleave for matrix
+    |a0|b0|a1|b1|a4|b4|a5|b5|a8 |b8 |a9 |b9 |a12|b12|a13|b13|
+    |c0|d0|c1|d1|c4|d4|c5|d5|c8 |d8 |c9 |d9 |c12|d12|c13|d13|
+    |e0|f0|e1|f1|e4|f4|e5|f5|e8 |f8 |e9 |f9 |e12|f12|e13|f13|
+    |g0|h0|g1|h1|g4|h4|g5|h5|g8 |h8 |g9 |h9 |g12|h12|g13|h13|
+    |a2|b2|a3|b3|a6|b6|a7|b7|a10|b10|a11|b11|a14|b14|a15|b15|
+    |c2|d2|c3|d3|c6|d6|c7|d7|c10|d10|c11|d11|c14|d14|c15|d15|
+    |e2|f2|e3|f3|e6|f6|e7|f7|e10|f10|e11|f11|e14|f14|e15|f15|
+    |g2|h2|g3|h3|g6|h6|g7|h7|g10|h10|g11|h11|g14|h14|g15|h15|
+
+    Step 2: 4-element interleave for matrix
+    |a0|b0|c0|d0|a4|b4|c4|d4|a8 |b8 |c8 |d8 |a12|b12|c12|d12|
+    |a1|b1|c1|d1|a5|b5|c5|d5|a9 |b9 |c9 |d9 |a13|b13|c13|d13|
+    |e0|f0|g0|h0|e4|f4|g4|h4|e8 |f8 |g8 |h8 |e12|f12|g12|h12|
+    |e1|f1|g1|h1|e5|f5|g5|h5|e9 |f9 |g9 |h9 |e13|f13|g13|h13|
+    |a2|b2|c2|d2|a6|b6|c6|d6|a10|b10|c10|d10|a14|b14|c14|d14|
+    |a3|b3|c3|d3|a7|b7|c7|d7|a11|b11|c11|d11|a15|b15|c15|d15|
+    |e2|f2|g2|h2|e6|f6|g6|h6|e10|f10|g10|h10|e14|f14|g14|h14|
+    |e3|f3|g3|h3|e7|f7|g7|h7|e11|f11|g11|h11|e15|f15|g15|h15|
+*/
+#define FP32_INTERLEAVE_8x16(regArray)                               \
+    regArray##_8  = _mm512_unpacklo_ps(regArray##_0, regArray##_1);  \
+    regArray##_9  = _mm512_unpacklo_ps(regArray##_2, regArray##_3);  \
+    regArray##_10 = _mm512_unpacklo_ps(regArray##_4, regArray##_5);  \
+    regArray##_11 = _mm512_unpacklo_ps(regArray##_6, regArray##_7);  \
+    regArray##_12 = _mm512_unpackhi_ps(regArray##_0, regArray##_1);  \
+    regArray##_13 = _mm512_unpackhi_ps(regArray##_2, regArray##_3);  \
+    regArray##_14 = _mm512_unpackhi_ps(regArray##_4, regArray##_5);  \
+    regArray##_15 = _mm512_unpackhi_ps(regArray##_6, regArray##_7);  \
+                                                                     \
+    regArray##_0 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_8,  (__m512d) regArray##_9);  \
+    regArray##_1 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_8,  (__m512d) regArray##_9);  \
+    regArray##_4 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \
+    regArray##_5 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \
+    regArray##_2 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \
+    regArray##_3 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \
+    regArray##_6 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_14, (__m512d) regArray##_15); \
+    regArray##_7 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_14, (__m512d) regArray##_15);
+
+#define FP32_INTERLEAVE_8x16_ARRAY(regArray)                               \
+    regArray[8]  = _mm512_unpacklo_ps(regArray[0], regArray[1]);  \
+    regArray[9]  = _mm512_unpacklo_ps(regArray[2], regArray[3]);  \
+    regArray[10] = _mm512_unpacklo_ps(regArray[4], regArray[5]);  \
+    regArray[11] = _mm512_unpacklo_ps(regArray[6], regArray[7]);  \
+    regArray[12] = _mm512_unpackhi_ps(regArray[0], regArray[1]);  \
+    regArray[13] = _mm512_unpackhi_ps(regArray[2], regArray[3]);  \
+    regArray[14] = _mm512_unpackhi_ps(regArray[4], regArray[5]);  \
+    regArray[15] = _mm512_unpackhi_ps(regArray[6], regArray[7]);  \
+                                                                     \
+    regArray[0] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[8],  (__m512d) regArray[9]);  \
+    regArray[1] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[8],  (__m512d) regArray[9]);  \
+    regArray[4] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[10], (__m512d) regArray[11]); \
+    regArray[5] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[10], (__m512d) regArray[11]); \
+    regArray[2] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[12], (__m512d) regArray[13]); \
+    regArray[3] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[12], (__m512d) regArray[13]); \
+    regArray[6] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[14], (__m512d) regArray[15]); \
+    regArray[7] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[14], (__m512d) regArray[15]);
+
+/* 2-step interleave for matrix against 8 rows with 8 fp32 elements per row
+    Input  - register array of 8 rows of row-major matrix
+    Output - the output of Step 2
+
+    Step 1: 2-element interleave for matrix
+    |a0|b0|a1|b1|a4|b4|a5|b5|
+    |c0|d0|c1|d1|c4|d4|c5|d5|
+    |e0|f0|e1|f1|e4|f4|e5|f5|
+    |g0|h0|g1|h1|g4|h4|g5|h5|
+    |a2|b2|a3|b3|a6|b6|a7|b7|
+    |c2|d2|c3|d3|c6|d6|c7|d7|
+    |e2|f2|e3|f3|e6|f6|e7|f7|
+    |g2|h2|g3|h3|g6|h6|g7|h7|
+
+    Step 2: 4-element interleave for matrix
+    |a0|b0|c0|d0|a4|b4|c4|d4|
+    |a1|b1|c1|d1|a5|b5|c5|d5|
+    |e0|f0|g0|h0|e4|f4|g4|h4|
+    |e1|f1|g1|h1|e5|f5|g5|h5|
+    |a2|b2|c2|d2|a6|b6|c6|d6|
+    |a3|b3|c3|d3|a7|b7|c7|d7|
+    |e2|f2|g2|h2|e6|f6|g6|h6|
+    |e3|f3|g3|h3|e7|f7|g7|h7|
+*/
+#define FP32_INTERLEAVE_8x8(regArray)                                \
+    regArray##_8  = _mm256_unpacklo_ps(regArray##_0, regArray##_1);  \
+    regArray##_9  = _mm256_unpacklo_ps(regArray##_2, regArray##_3);  \
+    regArray##_10 = _mm256_unpacklo_ps(regArray##_4, regArray##_5);  \
+    regArray##_11 = _mm256_unpacklo_ps(regArray##_6, regArray##_7);  \
+    regArray##_12 = _mm256_unpackhi_ps(regArray##_0, regArray##_1);  \
+    regArray##_13 = _mm256_unpackhi_ps(regArray##_2, regArray##_3);  \
+    regArray##_14 = _mm256_unpackhi_ps(regArray##_4, regArray##_5);  \
+    regArray##_15 = _mm256_unpackhi_ps(regArray##_6, regArray##_7);  \
+                                                                     \
+    regArray##_0 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_8,  (__m256d) regArray##_9);  \
+    regArray##_1 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_8,  (__m256d) regArray##_9);  \
+    regArray##_4 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \
+    regArray##_5 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \
+    regArray##_2 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \
+    regArray##_3 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \
+    regArray##_6 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_14, (__m256d) regArray##_15); \
+    regArray##_7 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_14, (__m256d) regArray##_15);
+
+
+/* Accumulate the result for 2 batches of 4 registers
+*/
+#define FP32_ACCUM2_8x16(regArray)                             \
+    regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_1);  \
+    regArray##_2 = _mm512_add_ps(regArray##_2, regArray##_3);  \
+    regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_5);  \
+    regArray##_6 = _mm512_add_ps(regArray##_6, regArray##_7);  \
+    regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_2);  \
+    regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_6);
+
+#define FP32_ACCUM2_8x16_ARRAY(regArray)                             \
+    regArray[0] = _mm512_add_ps(regArray[0], regArray[1]);  \
+    regArray[2] = _mm512_add_ps(regArray[2], regArray[3]);  \
+    regArray[4] = _mm512_add_ps(regArray[4], regArray[5]);  \
+    regArray[6] = _mm512_add_ps(regArray[6], regArray[7]);  \
+    regArray[0] = _mm512_add_ps(regArray[0], regArray[2]);  \
+    regArray[4] = _mm512_add_ps(regArray[4], regArray[6]);
+
+/* Accumulate the result for 2 batches of 4 registers
+*/
+#define FP32_ACCUM2_8x8(regArray)                              \
+    regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_1);  \
+    regArray##_2 = _mm256_add_ps(regArray##_2, regArray##_3);  \
+    regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_5);  \
+    regArray##_6 = _mm256_add_ps(regArray##_6, regArray##_7);  \
+    regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_2);  \
+    regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_6);
+
+
+/* Store 16 (alpha * result + beta * y) to y
+*/
+#define STORE16_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr)                                                 \
+    regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(targetAddr)));  \
+    _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 (alpha * result + beta * y) to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask)                                                  \
+    regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(mask, targetAddr)));  \
+    _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 (alpha * result + beta * y) to y
+*/
+#define STORE8_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr)                                                                                                  \
+    regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_loadu_ps(targetAddr)));  \
+    _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 (alpha * result + beta * y) to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask)                                                                                                   \
+    regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_maskz_loadu_ps(mask, targetAddr)));  \
+    _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 (alpha * result + beta * y) to y
+*/
+#define STORE4_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr)                                                                                         \
+    regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_loadu_ps(targetAddr)));  \
+    _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 (alpha * result + beta * y) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask)                                                                                          \
+    regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_maskz_loadu_ps(mask, targetAddr)));  \
+    _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 16 (alpha * result + y) to y
+*/
+#define STORE16_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr)                       \
+    regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_loadu_ps(targetAddr));  \
+    _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 (alpha * result + y) to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask)                        \
+    regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_maskz_loadu_ps(mask, targetAddr));  \
+    _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 (alpha * result + y) to y
+*/
+#define STORE8_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr)                                                \
+    regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_loadu_ps(targetAddr));  \
+    _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 (alpha * result + y) to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask)                                                 \
+    regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_maskz_loadu_ps(mask, targetAddr));  \
+    _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 (alpha * result + y) to y
+*/
+#define STORE4_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr)                                          \
+    regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_loadu_ps(targetAddr));  \
+    _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 (alpha * result + y) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask)                                           \
+    regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_maskz_loadu_ps(mask, targetAddr));  \
+    _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 16 (alpha * result) to y
+*/
+#define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr)  \
+    _mm512_storeu_ps(targetAddr, _mm512_mul_ps(ALPHAVECTOR, regResult));
+
+
+/* Masked store 16 (alpha * result) to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask)  \
+    _mm512_mask_storeu_ps(targetAddr, mask, _mm512_mul_ps(ALPHAVECTOR, regResult));
+
+
+/* Store 8 (alpha * result) to y
+*/
+#define STORE8_COMPLETE_RESULT_ALPHA(regResult, targetAddr)  \
+    _mm256_storeu_ps(targetAddr, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult));
+
+
+/* Masked store 8 (alpha * result) to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask)  \
+    _mm256_mask_storeu_ps(targetAddr, mask, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult));
+
+
+/* Store 4 (alpha * result) to y
+*/
+#define STORE4_COMPLETE_RESULT_ALPHA(regResult, targetAddr)  \
+    _mm_storeu_ps(targetAddr, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
+
+
+/* Masked store 4 (alpha * result) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask)  \
+    _mm_mask_storeu_ps(targetAddr, mask, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
+
+
+/* Store 16 result to y
+*/
+#define STORE16_COMPLETE_RESULT_DIRECT(regResult, targetAddr)  \
+    _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 result to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask)  \
+    _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 result to y
+*/
+#define STORE8_COMPLETE_RESULT_DIRECT(regResult, targetAddr)  \
+    _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 result to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask)  \
+    _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 result to y
+*/
+#define STORE4_COMPLETE_RESULT_DIRECT(regResult, targetAddr)  \
+    _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 result to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask)  \
+    _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+#endif
diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c
new file mode 100644
index 000000000..18e64dc3f
--- /dev/null
+++ b/kernel/x86_64/sbgemv_n.c
@@ -0,0 +1,137 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined (COOPERLAKE)
+#include "sbgemv_n_microk_cooperlake.c"
+#endif
+
+/* Allocate alloc_size elements of TYPE plus 63 slack bytes: ptr receives the raw block (the one to free), ptr_align its first 64-byte aligned address. */
+#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr)   \
+    do { (ptr) = (TYPE *) malloc(sizeof(TYPE)*(alloc_size) + 63); \
+         (ptr_align) = (((uintptr_t)(ptr) & (uintptr_t)0x3F) != 0) ? (TYPE *)((char *)(ptr) + (64 - (int)((uintptr_t)(ptr) & (uintptr_t)0x3F))) : (ptr); } while (0)
+/* Release a block obtained with ALIGN64_ALLOC; pass the raw ptr, not ptr_align. */
+#define ALIGN64_FREE(ptr)  free(ptr)
+
+#ifndef HAVE_SBGEMV_N_ACCL_KERNEL
+/* Portable fallback: y[i] = alpha * sum_j a[lda*j + i] * x[j] (+ beta * y[i] when beta != ZERO), for i < m, j < n; x and y are unit-stride. */
+static void sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    BLASLONG offset_lda, offset_m;
+    float accum = 0.0;
+
+    bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n);
+    float *    a_fp32 = malloc(sizeof(float)*m*n);
+    float *    x_fp32 = malloc(sizeof(float)*n);
+    // Pack the m x n part of a densely (stride m instead of lda) so that one SBF16TOS_K call can fill a_fp32
+    for (BLASLONG j=0; j<n; j++) {
+        offset_lda = lda * j;
+        offset_m = m * j;
+        for (BLASLONG i=0; i<m; i++) {
+            a_bf16[offset_m + i] = a[offset_lda + i];
+        }
+    }
+
+    SBF16TOS_K(n, x, 1, x_fp32, 1);
+    SBF16TOS_K(m*n, a_bf16, 1, a_fp32, 1);
+
+    for (BLASLONG i=0; i<m; i++) {
+        accum = 0.0;
+        for (BLASLONG j=0; j<n; j++) {
+            accum += a_fp32[j*m + i] * x_fp32[j];
+        }
+        if (beta == ZERO) {          // y[i] is only written here, its previous content is never read
+            y[i] = alpha * accum;
+        } else {
+            y[i] = alpha * accum + beta * y[i];
+        }
+    }
+
+    free(a_bf16);
+    free(a_fp32);
+    free(x_fp32);
+}
+#endif
+
+/* Copy the n bf16 values found at src[0], src[inc], src[2*inc], ... into the contiguous array target. */
+static void bf16_compress_vector(BLASLONG n, bfloat16 * src, bfloat16 * target, BLASLONG inc)
+{
+    BLASLONG src_pos = 0;
+    for (BLASLONG k = 0; k < n; k++, src_pos += inc) target[k] = src[src_pos];
+}
+
+/* Copy the n floats found at src[0], src[inc], src[2*inc], ... into the contiguous array target. */
+static void fp32_compress_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
+{
+    BLASLONG src_pos = 0;
+    for (BLASLONG k = 0; k < n; k++, src_pos += inc) target[k] = src[src_pos];
+}
+
+/* Copy the n contiguous floats of src to target[0], target[inc], target[2*inc], ... */
+static void fp32_expand_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
+{
+    BLASLONG dst_pos = 0;
+    for (BLASLONG k = 0; k < n; k++, dst_pos += inc) target[dst_pos] = src[k];
+}
+
+/* GEMV driver: packs strided x / y into contiguous 64-byte aligned scratch, runs sbgemv_kernel_n, copies y back. Always returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float * y, BLASLONG incy)
+{
+    if (!(m >= 1 && n >= 1)) return 0;
+
+    const int x_is_strided = (incx != 1);
+    const int y_is_strided = (incy != 1);
+
+    // Raw allocations (what gets freed) and the pointers actually handed to the kernel
+    bfloat16 * xbuffer = NULL, * xbuffer_align = x;
+    float    * ybuffer = NULL, * ybuffer_align = y;
+
+    if (x_is_strided) {
+        ALIGN64_ALLOC(n, bfloat16, xbuffer_align, xbuffer);
+        bf16_compress_vector(n, x, xbuffer_align, incx);
+    }
+
+    if (y_is_strided) {
+        ALIGN64_ALLOC(m, float, ybuffer_align, ybuffer);
+        // With beta == ZERO the old y values are not gathered into the scratch buffer
+        if (beta != ZERO) fp32_compress_vector(m, y, ybuffer_align, incy);
+    }
+
+    sbgemv_kernel_n(m, n, alpha, a, lda, xbuffer_align, beta, ybuffer_align);
+
+    if (y_is_strided) {
+        fp32_expand_vector(m, ybuffer_align, y, incy);
+        ALIGN64_FREE(ybuffer);
+    }
+
+    // Release the x scratch copy, if one was made
+    if (x_is_strided) ALIGN64_FREE(xbuffer);
+
+    return 0;
+}
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
new file mode 100644
index 000000000..d875e0d96
--- /dev/null
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
@@ -0,0 +1,76 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* The accelerated kernel is only built with GCC >= 10 when __AVX512BF16__ is defined, or with clang >= 9 */
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SBGEMV_N_ACCL_KERNEL 1
+#include "common.h"
+#include <immintrin.h>
+// The template below is included four times; ZERO_BETA / ONE_BETA / ONE_ALPHA select which kernel each pass emits
+// Pass 1: BETA is neither ZERO nor ONE -> sbgemv_kernel_32xN_lda_direct_alpha_beta
+#undef  ZERO_BETA
+#undef  ONE_BETA
+#undef  ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Pass 2: BETA is ONE -> sbgemv_kernel_32xN_lda_direct_alpha_one
+#undef  ZERO_BETA
+#define ONE_BETA  1
+#undef  ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Pass 3: BETA is ZERO, ALPHA is not ONE -> sbgemv_kernel_32xN_lda_direct_alpha (ONE_BETA is still defined, but the template tests ZERO_BETA first)
+#define ZERO_BETA 1
+#undef  ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Pass 4: BETA is ZERO and ALPHA is ONE -> sbgemv_kernel_32xN_lda_direct
+#define ZERO_BETA 1
+#define ONE_ALPHA 1
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Dispatch to the kernel variant that matches the given ALPHA / BETA combination; always returns 0
+static int sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    if (beta == ZERO && alpha == ONE) {
+        // BETA == 0.0 and ALPHA == 1.0: no need to accumulate the original Y data or to multiply by ALPHA
+        sbgemv_kernel_32xN_lda_direct(m, n, alpha, a, lda, x, y);
+    } else if (beta == ZERO) {
+        // BETA == 0.0 and ALPHA != 1.0: no need to accumulate the original Y data, but need to multiply by ALPHA
+        sbgemv_kernel_32xN_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+    } else if (beta == ONE) {
+        // BETA == 1.0: need to accumulate the original Y data no matter what ALPHA is
+        sbgemv_kernel_32xN_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+    } else {
+        // Any other BETA: need to accumulate the original Y data no matter what ALPHA is
+        sbgemv_kernel_32xN_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+    }
+    return 0;
+}
+
+#endif
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
new file mode 100644
index 000000000..46e6d0ff9
--- /dev/null
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
@@ -0,0 +1,234 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+
+// Include common macros for BF16 based operations with IA intrinsics
+#include "bf16_common_macros.h"
+
+// This template is included several times per translation unit: drop the STORE* aliases left by a previous inclusion before redefining them
+#undef STORE16_COMPLETE_RESULT
+#undef STORE16_MASK_COMPLETE_RESULT
+#undef STORE8_COMPLETE_RESULT
+#undef STORE8_MASK_COMPLETE_RESULT
+#undef STORE4_COMPLETE_RESULT
+#undef STORE4_MASK_COMPLETE_RESULT
+
+#ifndef ZERO_BETA  // Beta is non-zero
+#ifndef ONE_BETA       // BETA is not ONE
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_BETA
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
+
+#else                  // BETA is ONE
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_ONE
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
+#endif
+
+#else  // BETA is zero
+#ifndef ONE_ALPHA      // ALPHA is not ONE
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA
+
+#else                  // ALPHA is ONE
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_DIRECT
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_DIRECT
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_DIRECT
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_DIRECT
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_DIRECT
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_DIRECT
+#endif
+
+#endif
+
+
+
+// BF16 GEMV kernel (non-transposed, lda effective): computes 128 elements of y per outer step, then 32 at a time, then a masked tail of m%32 elements
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32xN_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32xN_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32xN_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x = m & (~31);      // m rounded down to a multiple of 32
+    BLASLONG tag_m_128x = m & (~127);    // m rounded down to a multiple of 128
+
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \
+           accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;
+
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+    __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3;
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
+    __m512i xArray_0;
+
+    __m512i ZERO512 = _mm512_setzero_si512();
+    // Blend masks: "hi" has the odd bits set, "lo" the even bits, i.e. they select the odd / even 16-bit elements of a vector
+    unsigned int blend_hi_mask_value = ((unsigned int)0xaaaaaaaa);
+    __mmask32 blend_hi_mask = *((__mmask32*) &blend_hi_mask_value);
+    unsigned int blend_lo_mask_value = ((unsigned int)0x55555555);
+    __mmask32 blend_lo_mask = *((__mmask32*) &blend_lo_mask_value);
+    // Permute indices interleaving two accumulators element by element: idx_base_0 takes their elements 0..7, idx_base_1 their elements 8..15
+    __m512i M512_EPI32_8 = _mm512_set1_epi32(8);
+    __m512i idx_base_0   = _mm512_set_epi32(23,  7, 22,  6, 21,  5,  20,  4, 19,  3, 18,  2, 17,  1,  16,  0);
+    __m512i idx_base_1   = _mm512_add_epi32(idx_base_0, M512_EPI32_8);
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) {
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+        accum512_2 = _mm512_setzero_ps();
+        accum512_3 = _mm512_setzero_ps();
+        accum512_4 = _mm512_setzero_ps();
+        accum512_5 = _mm512_setzero_ps();
+        accum512_6 = _mm512_setzero_ps();
+        accum512_7 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);    // x[idx_n] broadcast to every 16-bit element
+
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m +  0)
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_1, a, lda, idx_n, idx_m + 32)
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_2, a, lda, idx_n, idx_m + 64)
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_3, a, lda, idx_n, idx_m + 96)
+            // Keep only the even (lo mask) or the odd (hi mask) 16-bit elements of each seed; the others come from ZERO512
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_2 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_1);
+            matrixArray_3 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_1);
+            matrixArray_4 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_2);
+            matrixArray_5 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_2);
+            matrixArray_6 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_3);
+            matrixArray_7 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_3);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+            BF16_DOT_1x32(accum512_2, matrixArray_2, xArray_0)
+            BF16_DOT_1x32(accum512_3, matrixArray_3, xArray_0)
+            BF16_DOT_1x32(accum512_4, matrixArray_4, xArray_0)
+            BF16_DOT_1x32(accum512_5, matrixArray_5, xArray_0)
+            BF16_DOT_1x32(accum512_6, matrixArray_6, xArray_0)
+            BF16_DOT_1x32(accum512_7, matrixArray_7, xArray_0)
+        }
+        accum512_8  = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9  = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+        accum512_10 = _mm512_permutex2var_ps(accum512_2, idx_base_0, accum512_3);
+        accum512_11 = _mm512_permutex2var_ps(accum512_2, idx_base_1, accum512_3);
+        accum512_12 = _mm512_permutex2var_ps(accum512_4, idx_base_0, accum512_5);
+        accum512_13 = _mm512_permutex2var_ps(accum512_4, idx_base_1, accum512_5);
+        accum512_14 = _mm512_permutex2var_ps(accum512_6, idx_base_0, accum512_7);
+        accum512_15 = _mm512_permutex2var_ps(accum512_6, idx_base_1, accum512_7);
+
+        STORE16_COMPLETE_RESULT(accum512_8,  y+idx_m+0)
+        STORE16_COMPLETE_RESULT(accum512_9,  y+idx_m+16)
+        STORE16_COMPLETE_RESULT(accum512_10, y+idx_m+32)
+        STORE16_COMPLETE_RESULT(accum512_11, y+idx_m+48)
+        STORE16_COMPLETE_RESULT(accum512_12, y+idx_m+64)
+        STORE16_COMPLETE_RESULT(accum512_13, y+idx_m+80)
+        STORE16_COMPLETE_RESULT(accum512_14, y+idx_m+96)
+        STORE16_COMPLETE_RESULT(accum512_15, y+idx_m+112)
+    }
+
+    for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_32x; idx_m+=32) {
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+        }
+        accum512_8  = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9  = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+
+        STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0)
+        STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16)
+    }
+
+    if (tag_m_32x != m) {
+        unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31)));
+        __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);
+        // Mask of the last, partially used 16-float store: the low (m&15) bits, or all 16 bits when (m&15)==0 (a tail of exactly 16 elements)
+        unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> ((16-(m&15)) & 15));
+        __mmask32 store_tail_mask = (__mmask32) store_tail_mask_value;
+
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, tag_m_32x, tail_mask)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+        }
+        accum512_8  = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9  = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+        // More than 16 tail elements: the first 16 are stored whole and the mask trims the second store; otherwise the mask trims the first store
+        if ((m-tag_m_32x) > 16) {
+            STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0)
+            STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask)
+        } else {
+            STORE16_MASK_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0, store_tail_mask)
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c
new file mode 100644
index 000000000..22b099116
--- /dev/null
+++ b/kernel/x86_64/sbgemv_t.c
@@ -0,0 +1,142 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined (COOPERLAKE)
+#include "sbgemv_t_microk_cooperlake.c"
+#endif
+
+/* Allocate alloc_size elements of TYPE plus 63 slack bytes: ptr receives the raw block (the one to free), ptr_align its first 64-byte aligned address. */
+#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr)   \
+    do { (ptr) = (TYPE *) malloc(sizeof(TYPE)*(alloc_size) + 63); \
+         (ptr_align) = (((uintptr_t)(ptr) & (uintptr_t)0x3F) != 0) ? (TYPE *)((char *)(ptr) + (64 - (int)((uintptr_t)(ptr) & (uintptr_t)0x3F))) : (ptr); } while (0)
+/* Release a block obtained with ALIGN64_ALLOC; pass the raw ptr, not ptr_align. */
+#define ALIGN64_FREE(ptr)  free(ptr)
+
+#ifndef HAVE_SBGEMV_T_ACCL_KERNEL
+/* Portable fallback: y[i] = alpha * sum_j a[lda*i + j] * x[j] (+ beta * y[i] when beta != ZERO), for i < m, j < n; x and y are unit-stride. */
+static void sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    bfloat16 * packed_bf16 = malloc(sizeof(bfloat16)*m*n);
+    float *    packed_fp32 = malloc(sizeof(float)*m*n);
+    float *    x_as_fp32   = malloc(sizeof(float)*n);
+
+    // Gather the first n elements of each of the m lda-strided vectors of a into one dense m*n array
+    for (BLASLONG row = 0; row < m; row++) {
+        bfloat16 * from = a + lda * row;
+        bfloat16 * into = packed_bf16 + n * row;
+        for (BLASLONG col = 0; col < n; col++) {
+            into[col] = from[col];
+        }
+    }
+
+    // Fill the float buffers from the bf16 data through SBF16TOS_K
+    SBF16TOS_K(n, x, 1, x_as_fp32, 1);
+    SBF16TOS_K(m*n, packed_bf16, 1, packed_fp32, 1);
+
+    for (BLASLONG row = 0; row < m; row++) {
+        const float * a_row = packed_fp32 + n * row;
+        float dot = 0.0;
+        for (BLASLONG col = 0; col < n; col++) {
+            dot += a_row[col] * x_as_fp32[col];
+        }
+        if (beta != ZERO) {
+            y[row] = alpha * dot + beta * y[row];
+        } else {
+            y[row] = alpha * dot;
+        }
+    }
+
+    free(packed_bf16);
+    free(packed_fp32);
+    free(x_as_fp32);
+}
+#endif
+
+/* Copy the n bf16 values found at src[0], src[inc], src[2*inc], ... into the contiguous array target. */
+static void bf16_compress_vector(BLASLONG n, bfloat16 * src, bfloat16 * target, BLASLONG inc)
+{
+    BLASLONG src_pos = 0;
+    for (BLASLONG k = 0; k < n; k++, src_pos += inc) target[k] = src[src_pos];
+}
+
+/* Copy the n floats found at src[0], src[inc], src[2*inc], ... into the contiguous array target. */
+static void fp32_compress_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
+{
+    BLASLONG src_pos = 0;
+    for (BLASLONG k = 0; k < n; k++, src_pos += inc) target[k] = src[src_pos];
+}
+
+/* Copy the n contiguous floats of src to target[0], target[inc], target[2*inc], ... */
+static void fp32_expand_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
+{
+    BLASLONG dst_pos = 0;
+    for (BLASLONG k = 0; k < n; k++, dst_pos += inc) target[dst_pos] = src[k];
+}
+
+/* Transposed GEMV driver: swaps m and n, packs strided x / y into contiguous scratch, runs sbgemv_kernel_t, copies y back. Always returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float * y, BLASLONG incy)
+{
+    if (!(m >= 1 && n >= 1)) return 0;
+
+    const int x_is_strided = (incx != 1);
+    const int y_is_strided = (incy != 1);
+
+    // Raw allocations (what gets freed) and the pointers actually handed to the kernel
+    bfloat16 * xbuffer = NULL, * xbuffer_align = x;
+    float    * ybuffer = NULL, * ybuffer_align = y;
+
+    // Switch m and n: from here on n elements of x are read and m elements of y are written
+    const BLASLONG swapped = m;
+    m = n;
+    n = swapped;
+
+    if (x_is_strided) {
+        ALIGN64_ALLOC(n, bfloat16, xbuffer_align, xbuffer);
+        bf16_compress_vector(n, x, xbuffer_align, incx);
+    }
+
+    if (y_is_strided) {
+        ALIGN64_ALLOC(m, float, ybuffer_align, ybuffer);
+        // With beta == ZERO the old y values are not gathered into the scratch buffer
+        if (beta != ZERO) fp32_compress_vector(m, y, ybuffer_align, incy);
+    }
+
+    sbgemv_kernel_t(m, n, alpha, a, lda, xbuffer_align, beta, ybuffer_align);
+
+    if (y_is_strided) {
+        fp32_expand_vector(m, ybuffer_align, y, incy);
+        ALIGN64_FREE(ybuffer);
+    }
+
+    // Release the x scratch copy, if one was made
+    if (x_is_strided) ALIGN64_FREE(xbuffer);
+
+    return 0;
+}
diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
new file mode 100644
index 000000000..23da2e809
--- /dev/null
+++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
@@ -0,0 +1,202 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* The accelerated kernel is only built with GCC >= 10 when __AVX512BF16__ is defined, or with clang >= 9 */
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SBGEMV_T_ACCL_KERNEL 1
+
+// Pass 1 over the template: micro kernels for ALPHA not ONE and BETA neither ZERO nor ONE (none of the three switches defined)
+#undef  ZERO_BETA
+#undef  ONE_BETA
+#undef  ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Pass 2: micro kernels for ALPHA not ONE and BETA equal to ONE (the dispatcher below calls the *_alpha_one kernels for beta == ONE)
+#undef  ZERO_BETA
+#define ONE_BETA  1
+#undef  ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Pass 3: micro kernels for ALPHA not ONE and BETA ineffective (BETA == 0); ONE_BETA is still defined from pass 2 at this point
+#define ZERO_BETA 1
+#undef  ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Pass 4: micro kernels for ALPHA equal to ONE and BETA ineffective (BETA == 0)
+#define ZERO_BETA 1
+#define ONE_ALPHA 1
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+static int sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    if (beta == ZERO) {          // BETA == 0.0, no need to accumulate the original Y data
+        if (alpha == ONE) {           // ALPHA == 1.0, no need to multipy ALPHA
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct(m, n, alpha, a, lda, x, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct(m, n, alpha, a, lda, x, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda(m, n, alpha, a, lda, x, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1 (m, alpha, a, x, y); break;
+                            case 2:  sbgemv_kernel_32x2 (m, alpha, a, x, y); break;
+                            case 3:  sbgemv_kernel_32x3 (m, alpha, a, x, y); break;
+                            case 4:  sbgemv_kernel_16x4 (m, alpha, a, x, y); break;
+                            case 5:  sbgemv_kernel_30x5 (m, alpha, a, x, y); break;
+                            case 6:  sbgemv_kernel_16x6 (m, alpha, a, x, y); break;
+                            case 7:  sbgemv_kernel_16x7 (m, alpha, a, x, y); break;
+                            case 8:  sbgemv_kernel_16x8 (m, alpha, a, x, y); break;
+                            case 9:  sbgemv_kernel_14x9 (m, alpha, a, x, y); break;
+                            case 10: sbgemv_kernel_12x10(m, alpha, a, x, y); break;
+                            case 11: sbgemv_kernel_15x11(m, alpha, a, x, y); break;
+                            case 12: sbgemv_kernel_15x12(m, alpha, a, x, y); break;
+                            case 13: sbgemv_kernel_16x13(m, alpha, a, x, y); break;
+                            case 14: sbgemv_kernel_16x14(m, alpha, a, x, y); break;
+                            case 15: sbgemv_kernel_16x15(m, alpha, a, x, y); break;
+                            case 16: sbgemv_kernel_16x16(m, alpha, a, x, y); break;
+                            default: break;
+                        }
+                    } else {
+                        sbgemv_kernel_8x16m_lda(m, n, alpha, a, lda, x, y);
+                    }
+                }
+            }
+        } else {                      // ALPHA != 1.0, need to multipy ALPHA
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda_alpha(m, n, alpha, a, lda, x, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1_alpha (m, alpha, a, x, y); break;
+                            case 2:  sbgemv_kernel_32x2_alpha (m, alpha, a, x, y); break;
+                            case 3:  sbgemv_kernel_32x3_alpha (m, alpha, a, x, y); break;
+                            case 4:  sbgemv_kernel_16x4_alpha (m, alpha, a, x, y); break;
+                            case 5:  sbgemv_kernel_30x5_alpha (m, alpha, a, x, y); break;
+                            case 6:  sbgemv_kernel_16x6_alpha (m, alpha, a, x, y); break;
+                            case 7:  sbgemv_kernel_16x7_alpha (m, alpha, a, x, y); break;
+                            case 8:  sbgemv_kernel_16x8_alpha (m, alpha, a, x, y); break;
+                            case 9:  sbgemv_kernel_14x9_alpha (m, alpha, a, x, y); break;
+                            case 10: sbgemv_kernel_12x10_alpha(m, alpha, a, x, y); break;
+                            case 11: sbgemv_kernel_15x11_alpha(m, alpha, a, x, y); break;
+                            case 12: sbgemv_kernel_15x12_alpha(m, alpha, a, x, y); break;
+                            case 13: sbgemv_kernel_16x13_alpha(m, alpha, a, x, y); break;
+                            case 14: sbgemv_kernel_16x14_alpha(m, alpha, a, x, y); break;
+                            case 15: sbgemv_kernel_16x15_alpha(m, alpha, a, x, y); break;
+                            case 16: sbgemv_kernel_16x16_alpha(m, alpha, a, x, y); break;
+                            default: break;
+                        }
+                    } else {
+                        sbgemv_kernel_8x16m_lda_alpha(m, n, alpha, a, lda, x, y);
+                    }
+                }
+            }
+        }
+    } else {                     // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is
+        if (beta == ONE) {
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda_alpha_one(m, n, alpha, a, lda, x, beta, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 2:  sbgemv_kernel_32x2_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 3:  sbgemv_kernel_32x3_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 4:  sbgemv_kernel_16x4_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 5:  sbgemv_kernel_30x5_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 6:  sbgemv_kernel_16x6_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 7:  sbgemv_kernel_16x7_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 8:  sbgemv_kernel_16x8_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 9:  sbgemv_kernel_14x9_alpha_one (m, alpha, a, x, beta, y); break;
+                            case 10: sbgemv_kernel_12x10_alpha_one(m, alpha, a, x, beta, y); break;
+                            case 11: sbgemv_kernel_15x11_alpha_one(m, alpha, a, x, beta, y); break;
+                            case 12: sbgemv_kernel_15x12_alpha_one(m, alpha, a, x, beta, y); break;
+                            case 13: sbgemv_kernel_16x13_alpha_one(m, alpha, a, x, beta, y); break;
+                            case 14: sbgemv_kernel_16x14_alpha_one(m, alpha, a, x, beta, y); break;
+                            case 15: sbgemv_kernel_16x15_alpha_one(m, alpha, a, x, beta, y); break;
+                            case 16: sbgemv_kernel_16x16_alpha_one(m, alpha, a, x, beta, y); break;
+                            default: break;
+                        }
+                    } else {
+                        sbgemv_kernel_8x16m_lda_alpha_one(m, n, alpha, a, lda, x, beta, y);
+                    }
+                }
+            }
+        } else {
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 2:  sbgemv_kernel_32x2_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 3:  sbgemv_kernel_32x3_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 4:  sbgemv_kernel_16x4_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 5:  sbgemv_kernel_30x5_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 6:  sbgemv_kernel_16x6_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 7:  sbgemv_kernel_16x7_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 8:  sbgemv_kernel_16x8_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 9:  sbgemv_kernel_14x9_alpha_beta (m, alpha, a, x, beta, y); break;
+                            case 10: sbgemv_kernel_12x10_alpha_beta(m, alpha, a, x, beta, y); break;
+                            case 11: sbgemv_kernel_15x11_alpha_beta(m, alpha, a, x, beta, y); break;
+                            case 12: sbgemv_kernel_15x12_alpha_beta(m, alpha, a, x, beta, y); break;
+                            case 13: sbgemv_kernel_16x13_alpha_beta(m, alpha, a, x, beta, y); break;
+                            case 14: sbgemv_kernel_16x14_alpha_beta(m, alpha, a, x, beta, y); break;
+                            case 15: sbgemv_kernel_16x15_alpha_beta(m, alpha, a, x, beta, y); break;
+                            case 16: sbgemv_kernel_16x16_alpha_beta(m, alpha, a, x, beta, y); break;
+                            default: break;
+                        }
+                    } else {
+                        sbgemv_kernel_8x16m_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+#endif
diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c
new file mode 100644
index 000000000..51e681add
--- /dev/null
+++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c
@@ -0,0 +1,3082 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+// Include common macros for BF16 based operations with IA intrinsics
+#include "bf16_common_macros.h"
+
+#ifndef ZERO_BETA  // Beta is non-zero
+// Bind the generic STORE* names used by the kernels below to the variant matching the ALPHA/BETA case being compiled
+#ifndef ONE_BETA       // BETA is not ONE: bind to the *_ALPHA_BETA variants
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_BETA
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
+
+#else                  // BETA is ONE: bind to the *_ALPHA_ONE variants
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_ONE
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
+
+#endif                 // ONE_BETA
+
+#else  // BETA is zero
+
+#ifndef ONE_ALPHA      // ALPHA is not ONE: bind to the *_ALPHA variants
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA
+
+#else                  // ALPHA is ONE: bind to the *_DIRECT variants
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_DIRECT
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_DIRECT
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_DIRECT
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_DIRECT
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_DIRECT
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_DIRECT
+
+#endif                 // ONE_ALPHA
+
+#endif  // ZERO_BETA
+
+
+// BF16 GEMV kernel for n=1 with A stored contiguously (lda == n): 32 rows per main-loop step, masked 512/256/128-bit paths for the m%32 tail
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32x1_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32x1_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32x1_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32x1(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x  = m & (~31);    // m rounded down to a multiple of 32
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2;
+    __m512i xArray;
+    __m512  result_0, result_1;
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);    // not referenced by name below; presumably consumed by the STORE* macros
+#endif
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);     // likewise presumably consumed by the STORE* macros
+#endif
+#endif
+    // Word indices spreading 16 consecutive matrix elements over the even lanes; the odd lanes are zeroed by the masked permutes
+    __m512i load_idx_lo   = _mm512_set_epi16(0, 15,  0, 14,  0, 13,  0, 12,  0, 11,  0, 10,  0,  9,  0,  8,\
+                                             0,  7,  0,  6,  0,  5,  0,  4,  0,  3,  0,  2,  0,  1,  0,  0);
+    __m512i M512_EPI16_16 = _mm512_set1_epi16(16);
+    __m512i load_idx_hi   = _mm512_add_epi16(load_idx_lo, M512_EPI16_16);
+
+    unsigned int interleave_mask_value = ((unsigned int) 0x55555555);    // one bit per even 16-bit lane
+    __mmask32 interleave_mask = *((__mmask32*) &interleave_mask_value);
+
+    xArray = _mm512_set1_epi16((short) x[0]);
+    xArray = _mm512_mask_blend_epi16(interleave_mask, _mm512_setzero_si512(), xArray);   // |x0|0|x0|0|...|x0|0|
+
+    if (tag_m_32x > 0) {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)]);  // Load 32 rows with n=1
+            matrixArray_1 = _mm512_maskz_permutexvar_epi16(interleave_mask, load_idx_lo, matrixArray_0);  // Expand the low 16 elements, odd lanes = 0
+            matrixArray_2 = _mm512_maskz_permutexvar_epi16(interleave_mask, load_idx_hi, matrixArray_0);  // Expand the high 16 elements, odd lanes = 0
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+            STORE16_COMPLETE_RESULT(result_1, y+idx_m+16)
+        }
+    }
+
+    BLASLONG tail_num = m - tag_m_32x;  // 0..31 rows left; for 0 the last branch runs with all-zero load/store masks
+    if (tail_num > 16) {
+        result_0 = _mm512_setzero_ps();
+        result_1 = _mm512_setzero_ps();
+
+        unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-tail_num));    // low tail_num bits set
+        __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);
+        matrixArray_0 = _mm512_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]);  // Load the remaining 17..31 rows with n=1
+        matrixArray_1 = _mm512_maskz_permutexvar_epi16(interleave_mask, load_idx_lo, matrixArray_0);  // Expand the low 16 elements, odd lanes = 0
+        matrixArray_2 = _mm512_maskz_permutexvar_epi16(interleave_mask, load_idx_hi, matrixArray_0);  // Expand the high 16 elements, odd lanes = 0
+
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray);
+        result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray);
+
+        unsigned short store_mask_value = (((unsigned short)0xffff) >> (32-tail_num));   // low (tail_num-16) bits set
+        __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+        STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x)
+        STORE16_MASK_COMPLETE_RESULT(result_1, y+tag_m_32x+16, store_mask)
+    } else if (tail_num > 8) {
+        __m256 result256_0 = _mm256_setzero_ps();
+        __m256 result256_1 = _mm256_setzero_ps();
+
+        __m256i load_idx_lo256 = _mm512_castsi512_si256(load_idx_lo);                // indices 0..7 in the even lanes
+        __m256i load_idx_hi256 = _mm512_extracti32x8_epi32(load_idx_lo, 0x1);        // indices 8..15 in the even lanes
+        __m256i xArray256 = _mm512_castsi512_si256(xArray);
+
+        unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num));    // low tail_num bits set
+        __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+        __m256i matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]);  // Load the remaining 9..16 rows with n=1
+        __m256i matrixArray256_1 = _mm256_maskz_permutexvar_epi16((__mmask16) interleave_mask, load_idx_lo256, matrixArray256_0);  // Expand the low 8 elements, odd lanes = 0
+        __m256i matrixArray256_2 = _mm256_maskz_permutexvar_epi16((__mmask16) interleave_mask, load_idx_hi256, matrixArray256_0);  // Expand the high 8 elements, odd lanes = 0
+
+        result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_1, (__m256bh) xArray256);
+        result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_2, (__m256bh) xArray256);
+
+        unsigned char store_mask_value = (((unsigned char)0xff) >> (16-tail_num));       // low (tail_num-8) bits set
+        __mmask8 store_mask = *((__mmask8*) &store_mask_value);
+        STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x)
+        STORE8_MASK_COMPLETE_RESULT(result256_1, y+tag_m_32x+8, store_mask)
+    } else {
+        __m128 result128_0 = _mm_setzero_ps();
+        __m128 result128_1 = _mm_setzero_ps();
+
+        __m128i load_idx_lo128 = _mm_set_epi16(0, 3, 0, 2, 0, 1, 0, 0);
+        __m128i M128_EPI16_4   = _mm_set1_epi16(4);
+        __m128i load_idx_hi128 = _mm_add_epi16(load_idx_lo128, M128_EPI16_4);
+
+        __m128i xArray128 = _mm512_castsi512_si128(xArray);
+
+        unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));         // low tail_num bits set (0 when tail_num == 0)
+        __mmask8 tail_mask = *((__mmask8*) &tail_mask_value);
+        __m128i matrixArray128_0 = _mm_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]);  // Load the remaining 0..8 rows with n=1
+        __m128i matrixArray128_1 = _mm_maskz_permutexvar_epi16((__mmask8) interleave_mask, load_idx_lo128, matrixArray128_0);  // Expand the low 4 elements, odd lanes = 0
+        __m128i matrixArray128_2 = _mm_maskz_permutexvar_epi16((__mmask8) interleave_mask, load_idx_hi128, matrixArray128_0);  // Expand the high 4 elements, odd lanes = 0
+
+        result128_0 = _mm_dpbf16_ps(result128_0, (__m128bh) matrixArray128_1, (__m128bh) xArray128);
+        result128_1 = _mm_dpbf16_ps(result128_1, (__m128bh) matrixArray128_2, (__m128bh) xArray128);
+
+        if (tail_num > 4) {
+            unsigned char store_mask_value = (((unsigned char)0xf) >> (8-tail_num));     // low (tail_num-4) bits set
+            __mmask8 store_mask = *((__mmask8*) &store_mask_value);
+            STORE4_COMPLETE_RESULT(result128_0, y+tag_m_32x)
+            STORE4_MASK_COMPLETE_RESULT(result128_1, y+tag_m_32x+4, store_mask)
+        } else {
+            unsigned char store_mask_value = (((unsigned char)0xf) >> (4-tail_num));     // low tail_num bits set
+            __mmask8 store_mask = *((__mmask8*) &store_mask_value);
+            STORE4_MASK_COMPLETE_RESULT(result128_0, y+tag_m_32x, store_mask)
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=2 with A stored contiguously (lda == n): 32 rows per main-loop step, then one 16-row step and a tail of up to 15 rows
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32x2_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32x2_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32x2_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x  = m & (~31);    // m rounded down to a multiple of 32
+
+    __m512i matrixArray_0, matrixArray_1;
+    __m512i xArray;
+    __m512  result_0, result_1;
+
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);    // not referenced by name below; presumably consumed by the STORE* macros
+#endif
+#if !defined(ZERO_BETA) && !defined(ONE_BETA)   // same guard as the n=1 kernel: no beta vector is created when BETA is one
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+    unsigned char load_mask_value = (((unsigned char)0xff) >> 6);    // 0x3: read exactly x[0] and x[1]
+    __mmask8 load_mask = *((__mmask8*) &load_mask_value);
+    xArray = _mm512_broadcastd_epi32(_mm_maskz_loadu_epi16(load_mask, x));    // |x0|x1|x0|x1|...|x0|x1|
+
+    if (tag_m_32x > 0) {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*2]);     // Load 16 rows as n=2
+            matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+16)*2]);  // Load 16 rows as n=2
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+            STORE16_COMPLETE_RESULT(result_1, y+idx_m+16)
+        }
+    }
+
+    if (m - tag_m_32x >= 16) {          // one more full group of 16 rows
+        result_0 = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_32x)*2]);     // Load 16 rows with n=2
+
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray);
+
+        STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x)
+
+        tag_m_32x += 16;                // from here on tag_m_32x is a multiple of 16, not necessarily of 32
+    }
+
+    BLASLONG tail_num = m - tag_m_32x;  // 0..15 rows left
+    if (tail_num > 8) {
+        result_0 = _mm512_setzero_ps();
+
+        unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(m&15)));      // m&15 == tail_num here: one bit per remaining row
+        __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+        matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]);  // Load the remaining 9..15 rows with n=2 (one dword per row)
+
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray);
+
+        STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_32x, tail_mask)
+    } else if (tail_num == 8) {
+        __m256 result256 = _mm256_setzero_ps();
+
+        __m256i matrixArray256 = _mm256_loadu_si256((const __m256i *) &a[(tag_m_32x)*2]);     // Load 8 rows with n=2; the cast matches the intrinsic's pointer type
+        __m256i xArray256 = _mm512_castsi512_si256(xArray);
+        result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256);
+
+        STORE8_COMPLETE_RESULT(result256, y+tag_m_32x)
+    } else {
+        __m256 result256 = _mm256_setzero_ps();
+
+        unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-(m&7)));            // m&7 == tail_num here; the mask is 0 when no rows remain
+        __mmask8 tail_mask = *((__mmask8*) &tail_mask_value);
+        __m256i matrixArray256 = _mm256_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]);  // Load the remaining 0..7 rows with n=2
+        __m256i xArray256 = _mm512_castsi512_si256(xArray);
+        result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256);
+
+        STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_32x, tail_mask)
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=3 with A stored contiguously (lda == n): 32 rows per main-loop step, then one 16-row step and a masked tail of up to 15 rows
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32x3_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32x3_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32x3_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x  = m & (~31);    // m rounded down to a multiple of 32
+
+    __m512  result_0, result_1;
+
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);    // not referenced by name below; presumably consumed by the STORE* macros
+#endif
+#if !defined(ZERO_BETA) && !defined(ONE_BETA)   // same guard as the n=1 kernel: no beta vector is created when BETA is one
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5);    // 0x7: read exactly x[0..2]
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i xTmp = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|0|0|0|0|0|
+    __m512i xArray_0 = _mm512_broadcastd_epi32(xTmp);                          // x0|x1|x0|x1|...|x0|x1|
+    __m512i xArray_1 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1));  // x2| 0|x2| 0|...|x2| 0|
+
+    __m512i load_idx_base;
+    __m512i M512_EPI16_2, M512_EPI16_8, M512_EPI16_16;
+    M512_EPI16_2  = _mm512_set1_epi16(2);
+    M512_EPI16_8  = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2);    // intermediate value 4
+    M512_EPI16_8  = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8);    // 8
+    M512_EPI16_16 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8);    // 16
+    load_idx_base = _mm512_set_epi16(46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24,
+                                     22, 21, 19, 18, 16, 15, 13, 12, 10,  9,  7,  6,  4,  3,  1,  0);  // (3r, 3r+1) for rows r = 0..15
+
+    if (tag_m_32x > 0) {
+        __m512i load_idx01_1st, load_idx01_2nd, load_idx2_1st, load_idx2_2nd;
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6;
+
+        unsigned int idx_blend_mask_value = ((unsigned int)0x80000000);    // only the top 16-bit lane
+        __mmask32 idx_blend_mask = *((__mmask32*) &idx_blend_mask_value);
+
+        load_idx01_1st = load_idx_base;                                      // rows 0..15 inside matrixArray_0|matrixArray_1
+        load_idx01_2nd = _mm512_add_epi16(load_idx01_1st, M512_EPI16_16);    // rows 16..31 inside matrixArray_1|matrixArray_2
+        load_idx2_1st  = _mm512_add_epi16(load_idx01_1st, M512_EPI16_2);     // (3r+2, 3r+3): third column plus a padding lane
+        load_idx2_2nd  = _mm512_add_epi16(load_idx01_2nd, M512_EPI16_2);
+        load_idx2_2nd  = _mm512_mask_blend_epi16(idx_blend_mask, load_idx2_2nd, _mm512_setzero_si512());  // top lane would be index 64: point it at 0 instead
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*3]);           // Elements  0..31 of this 32-row group (10 rows plus 2 elements)
+            matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+10)*3 + 2)]);  // Elements 32..63 of this 32-row group
+            matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+21)*3 + 1)]);  // Elements 64..95 of this 32-row group
+
+            matrixArray_3 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_1st, matrixArray_1);  // Select the first 2 elements for each row
+            matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_1, load_idx01_2nd, matrixArray_2);  // Select the first 2 elements for each row
+            matrixArray_5 = _mm512_maskz_permutex2var_epi16((__mmask32) 0x55555555, matrixArray_0, load_idx2_1st,  matrixArray_1);  // Select the third element for each row, odd lanes = 0
+            matrixArray_6 = _mm512_maskz_permutex2var_epi16((__mmask32) 0x55555555, matrixArray_1, load_idx2_2nd,  matrixArray_2);  // Select the third element for each row, odd lanes = 0
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_3, (__m512bh) xArray_0);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_5, (__m512bh) xArray_1);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_4, (__m512bh) xArray_0);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_6, (__m512bh) xArray_1);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+            STORE16_COMPLETE_RESULT(result_1, y+idx_m+16)
+        }
+    }
+
+    if (tag_m_32x != m) {
+        __m256i load256_idx01_1st, load256_idx01_2nd, load256_idx2_1st, load256_idx2_2nd;
+        __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6;
+        __m256 result256_0, result256_1;
+
+        unsigned short idx256_blend_mask_value = ((unsigned short)0x8000);    // only the top 16-bit lane
+        __mmask16 idx256_blend_mask = *((__mmask16*) &idx256_blend_mask_value);
+
+        load256_idx01_1st = _mm512_castsi512_si256(load_idx_base);                                          // rows 0..7
+        load256_idx01_2nd = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_8));      // rows 8..15
+        load256_idx2_1st  = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_2));
+        load256_idx2_2nd  = _mm256_add_epi16(load256_idx01_2nd, _mm512_castsi512_si256(M512_EPI16_2));
+        load256_idx2_2nd  = _mm256_mask_blend_epi16(idx256_blend_mask, load256_idx2_2nd, _mm256_setzero_si256());  // top lane would be index 32: point it at 0 instead
+
+        if (m - tag_m_32x > 15) {
+            result256_0 = _mm256_setzero_ps();
+            result256_1 = _mm256_setzero_ps();
+
+            matrixArray256_0 = _mm256_loadu_si256((const __m256i *) &a[(tag_m_32x)*3]);           // Elements  0..15 of this 16-row group (5 rows plus 1 element)
+            matrixArray256_1 = _mm256_loadu_si256((const __m256i *) &a[((tag_m_32x+5)*3 + 1)]);   // Elements 16..31 of this 16-row group
+            matrixArray256_2 = _mm256_loadu_si256((const __m256i *) &a[((tag_m_32x+10)*3 + 2)]);  // Elements 32..47 of this 16-row group
+
+            matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);  // Select the first 2 elements for each row
+            matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2);  // Select the first 2 elements for each row
+            matrixArray256_5 = _mm256_maskz_permutex2var_epi16((__mmask16) 0x5555, matrixArray256_0, load256_idx2_1st,  matrixArray256_1);  // Select the third element for each row, odd lanes = 0
+            matrixArray256_6 = _mm256_maskz_permutex2var_epi16((__mmask16) 0x5555, matrixArray256_1, load256_idx2_2nd,  matrixArray256_2);  // Select the third element for each row, odd lanes = 0
+
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0));
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1));
+            result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0));
+            result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1));
+
+            STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x)
+            STORE8_COMPLETE_RESULT(result256_1, y+tag_m_32x+8)
+
+            tag_m_32x += 16;            // from here on tag_m_32x is a multiple of 16, not necessarily of 32
+        }
+
+        if (tag_m_32x != m) {
+            result256_0 = _mm256_setzero_ps();
+            result256_1 = _mm256_setzero_ps();
+            BLASLONG tail_num = m-tag_m_32x;    // 1..15 rows left
+
+            if (tail_num > 10) {
+                unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1)));    // 3*tail_num-32 elements remain for the third load
+                __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+                matrixArray256_0 = _mm256_loadu_si256((const __m256i *) &a[(tag_m_32x)*3]);           // Elements  0..15 of the tail
+                matrixArray256_1 = _mm256_loadu_si256((const __m256i *) &a[((tag_m_32x+5)*3 + 1)]);   // Elements 16..31 of the tail
+                matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]);  // Load m-tag_m_32x-10 rows
+
+                matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);  // Select the first 2 elements for each row
+                matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2);  // Select the first 2 elements for each row
+                matrixArray256_5 = _mm256_maskz_permutex2var_epi16((__mmask16) 0x5555, matrixArray256_0, load256_idx2_1st,  matrixArray256_1);  // Select the third element for each row, odd lanes = 0
+                matrixArray256_6 = _mm256_maskz_permutex2var_epi16((__mmask16) 0x5555, matrixArray256_1, load256_idx2_2nd,  matrixArray256_2);  // Select the third element for each row, odd lanes = 0
+
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1));
+                result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1));
+            } else if (tail_num > 5) {
+                unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2)));     // 3*tail_num-16 elements remain for the second load
+                __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+                matrixArray256_0 = _mm256_loadu_si256((const __m256i *) &a[(tag_m_32x)*3]);           // Elements  0..15 of the tail
+                matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]);   // Load m-tag_m_32x-5 rows
+                matrixArray256_2 = _mm256_setzero_si256();
+
+                matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);  // Select the first 2 elements for each row
+                matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2);  // Select the first 2 elements for each row
+                matrixArray256_5 = _mm256_maskz_permutex2var_epi16((__mmask16) 0x5555, matrixArray256_0, load256_idx2_1st,  matrixArray256_1);  // Select the third element for each row, odd lanes = 0
+                matrixArray256_6 = _mm256_maskz_permutex2var_epi16((__mmask16) 0x5555, matrixArray256_1, load256_idx2_2nd,  matrixArray256_2);  // Select the third element for each row, odd lanes = 0
+
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1));
+                result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1));
+            } else {
+                unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num*3)));             // 3*tail_num elements in a single load
+                __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+                matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)*3]);       // Load m-tag_m_32x rows
+                matrixArray256_1 = _mm256_setzero_si256();
+
+                matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);  // Select the first 2 elements for each row
+                matrixArray256_5 = _mm256_maskz_permutex2var_epi16((__mmask16) 0x5555, matrixArray256_0, load256_idx2_1st,  matrixArray256_1);  // Select the third element for each row, odd lanes = 0
+
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1));
+            }
+
+            unsigned short store_tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num)));    // one bit per remaining row
+            __mmask16 store_tail_mask = *((__mmask16*) &store_tail_mask_value);
+            __m512 result512 = _mm512_insertf32x8(_mm512_castps256_ps512(result256_0), result256_1, 0x1);    // rows 0..7 | rows 8..15
+            STORE16_MASK_COMPLETE_RESULT(result512, y+tag_m_32x, store_tail_mask)
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=4 with A packed contiguously (4 bf16 per row, lda not used): 16 rows per iteration, then one 8-row step, then a masked tail of fewer than 8 rows
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x4_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x4_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x4_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x  = m & (~15);                                    // m rounded down to a multiple of 16
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+    __m512i xArray_01, xArray_23, xArray_remix;
+    __m512  result;
+    // NOTE(review): ALPHAVECTOR/BETAVECTOR are not referenced directly below; presumably the STORE* macros (defined outside this chunk) use them -- confirm
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+    __m512i M512_EPI32_1 = _mm512_set1_epi32(1);
+    __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);   // even dwords: elements 0|1 of each row
+    __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1);                                    // odd dwords: elements 2|3 of each row
+    __m512i idx_base_remix = _mm512_inserti32x8(idx_base_0, _mm512_castsi512_si256(idx_base_1), 0x1);   // low half: even indices 0..14, high half: odd indices 1..15
+
+    unsigned char x_load_mask_value = (((unsigned char)0xf) >> 2);      // 0x3: two 32-bit lanes = four bf16 values
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i xTmp = _mm_maskz_loadu_epi32(x_load_mask, x);               // |x0|x1|x2|x3|0|0|0|0|
+    xArray_01 = _mm512_broadcastd_epi32(xTmp);                          // |x0|x1|x0|x1|...|x0|x1|
+    xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1));  // |x2|x3|x2|x3|...|x2|x3|
+    unsigned short blend_mask_value = ((unsigned short)0xff00);         // upper 8 dwords come from the second blend source
+    __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+    xArray_remix = _mm512_mask_blend_epi32(blend_mask, xArray_01, xArray_23); // |x0|x1|x0|x1|x0|x1|x0|x1|...|x2|x3|x2|x3|x2|x3|x2|x3|
+
+    if (tag_m_16x > 0) {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*4]);      // Load 8 rows with n=4
+            matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+8)*4]);    // Load 8 rows with n=4
+
+            matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_0, matrixArray_1);  // |a0|a1|...|h0|h1|i0|i1|...|p0|p1|
+            matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_1, matrixArray_1);  // |a2|a3|...|h2|h3|i2|i3|...|p2|p3|
+
+            result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_01);
+            result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_3, (__m512bh) xArray_23);
+
+            STORE16_COMPLETE_RESULT(result, y+idx_m)
+        }
+    }
+
+    if (m - tag_m_16x > 7) {                                                  // at least 8 rows remain after the 16-row loop
+        result = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*4]);                // Load 8 rows with n=4
+        matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0);  // a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix);
+        __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1));  // low half (0|1 products) + high half (2|3 products)
+
+        STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+        tag_m_16x += 8;
+    }
+
+    BLASLONG tail_num = m-tag_m_16x;                                          // fewer than 8 rows are left here
+    if (tail_num != 0) {
+        result = _mm512_setzero_ps();
+
+        unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num*2));   // two dwords per remaining row
+        __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+        matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_16x)*4]);  // Masked load of the remaining tail_num rows with n=4
+        matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0);  // a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix);
+        __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1));  // low half (0|1 products) + high half (2|3 products)
+
+        unsigned char store_tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));    // one result lane per remaining row
+        __mmask8 store_tail_mask = *((__mmask8*) &store_tail_mask_value);
+        STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, store_tail_mask)
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=5 with A packed contiguously (5 bf16 per row, lda not used): 30 rows per main iteration, then 12-row steps, then a masked tail (7..11 rows) or a row-by-row tail (fewer than 7 rows)
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_30x5_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_30x5_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_30x5_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_30x = m - (m%30);                                            // m rounded down to a multiple of 30
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 3);             // 0x1f: five 16-bit lanes
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x);                       // x0|x1|x2|x3|x4|0|0|0|
+    // NOTE(review): ALPHAVECTOR/BETAVECTOR are presumably consumed by the STORE* macros defined outside this chunk -- confirm
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+    __m512  result_0, result_1;
+    __m512i xArray_01 = _mm512_broadcastd_epi32(x128);                          // x0|x1|x0|x1|...|x0|x1|
+    __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1));  // x2|x3|x2|x3|...|x2|x3|
+    __m512i xArray_4  = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2));  // x4| 0|x4| 0|...|x4| 0|
+    // Word indices of elements 0|1 for 12 rows taken from two 6-row loads (indices >= 32 address the second source); the top 8 words are unused
+    __m512i M512_EPI16_2 = _mm512_set1_epi16(2);
+    __m512i load_idx01_stage1_1st = _mm512_set_epi16( 0,  0,  0,  0,  0,  0,  0,  0, 58, 57, 53, 52, 48, 47, 43, 42, 
+                                                        38, 37, 33, 32, 26, 25, 21, 20, 16, 15, 11, 10,  6,  5,  1,  0);
+    __m512i load_idx01_stage1_2nd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x39);  // 128-bit lanes reordered to [1,2,3,0]
+    __m512i load_idx01_stage1_3rd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x4f);  // 128-bit lanes reordered to [3,3,0,1]
+    // Adding 2 to every word index moves the selection from elements 0|1 to elements 2|3
+    __m512i load_idx23_stage1_1st = _mm512_add_epi16(load_idx01_stage1_1st, M512_EPI16_2);
+    __m512i load_idx23_stage1_2nd = _mm512_add_epi16(load_idx01_stage1_2nd, M512_EPI16_2);
+    __m512i load_idx23_stage1_3rd = _mm512_add_epi16(load_idx01_stage1_3rd, M512_EPI16_2);
+    // Adding 2 again selects element 4 plus the word that follows it (multiplied by the 0 lane of xArray_4)
+    __m512i load_idx4_stage1_1st  = _mm512_add_epi16(load_idx23_stage1_1st, M512_EPI16_2);
+    __m512i load_idx4_stage1_2nd  = _mm512_add_epi16(load_idx23_stage1_2nd, M512_EPI16_2);
+    __m512i load_idx4_stage1_3rd  = _mm512_add_epi16(load_idx23_stage1_3rd, M512_EPI16_2);
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4;
+    __m512i matrixArray_stage1_0, matrixArray_stage1_1, matrixArray_stage1_2;
+    __m512i matrixArray_stage2_0, matrixArray_stage2_1;
+
+    unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2);           // 30 bits set: 6 rows x 5 elements
+    __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+    unsigned short store_mask_value = (((unsigned short)0xffff) >> 2);          // 14 bits set: results 16..29 of a 30-row group
+    __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+
+    if (tag_m_30x > 0) {
+        unsigned short blend_mask_value_0 = ((unsigned short)0xf000);           // dwords 12..15 come from the second blend source
+        __mmask16 blend_mask_0 = *((__mmask16*) &blend_mask_value_0);
+        unsigned short blend_mask_value_1 = ((unsigned short)0x3f00);           // dwords 8..13 come from the second blend source
+        __mmask16 blend_mask_1 = *((__mmask16*) &blend_mask_value_1);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_30x; idx_m+=30) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]);       // Load 6 rows with n=5
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]);   // Load 6 rows with n=5
+            matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+12)*5)]);  // Load 6 rows with n=5
+            matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+18)*5)]);  // Load 6 rows with n=5
+            matrixArray_4 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+24)*5)]);  // Load 6 rows with n=5
+
+            // Process the 0|1 elements
+            // Stage 1: Select the 0|1 elements for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx01_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx01_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the 0|1 elements
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of the 0|1 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_01);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_01);
+
+            // Process the 2|3 elements
+            // Stage 1: Select the 2|3 elements for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx23_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx23_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the 2|3 elements
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of the 2|3 elements and accumulate the result of 0|1 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_23);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_23);
+
+            // Process element 4 of each row
+            // Stage 1: Select element 4 (and the word after it) for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx4_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx4_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the element-4 pairs
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of element 4 and accumulate onto the results of the 0|1 and 2|3 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0,  (__m512bh) xArray_4);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1,  (__m512bh) xArray_4);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+            STORE16_MASK_COMPLETE_RESULT(result_1, y+idx_m+16, store_mask)
+        }
+    }
+
+    if (m - tag_m_30x > 11) {                                                   // at least 12 rows remain after the 30-row loop
+        BLASLONG tag_m_12x = m - ((m-tag_m_30x)%12);
+        for (BLASLONG idx_m = tag_m_30x; idx_m < tag_m_12x; idx_m+=12) {
+            unsigned short store_less_mask_value = (((unsigned short)0xffff) >> 4);   // 12 bits set: one result per row
+            __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value);
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]);       // Load 6 rows with n=5
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]);   // Load 6 rows with n=5
+
+            // Interleave the elements
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+            matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+            // Calculate and accumulate the result
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4);
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_less_mask)
+            tag_m_30x += 12;                                                    // moves the tail start; the loop bound is tag_m_12x, so iteration is unaffected
+        }
+    }
+
+    BLASLONG tail_num = m - tag_m_30x;                                          // fewer than 12 rows are left here
+    if (tail_num > 6) {
+        unsigned short store_less_mask_value = (((unsigned short)0xffff) >> (4+(12-tail_num)));     // tail_num bits set
+        __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value);
+        unsigned int load_less_mask_value = (((unsigned int)0xffffffff) >> (2+(12-tail_num)*5));   // (tail_num-6)*5 bits set
+        __mmask32 load_less_mask = *((__mmask32*) &load_less_mask_value);
+        result_0 = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_30x)*5]);           // Load 6 rows with n=5
+        matrixArray_1 = _mm512_maskz_loadu_epi16(load_less_mask, &a[((tag_m_30x+6)*5)]);  // Load the remaining tail_num-6 rows with n=5
+
+        // Interleave the elements
+        matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+        matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+        matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+        // Calculate and accumulate the result
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01);
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23);
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4);
+
+        STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_30x, store_less_mask)
+    } else {
+        __m128i matrixArray128;
+        __m128  result128, tmp128;
+        for (BLASLONG i = tag_m_30x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*5]);       // Load 1 row with n=5
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);                    // bring lanes 2,3 down to lanes 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);                     // bring lane 1 down to lane 0
+            result128 = _mm_add_ps(result128, tmp128);                            // lane 0 now holds the sum of all four lanes
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=6 with A packed contiguously (6 bf16 per row, lda not used): 16 rows per iteration, one optional 8-row step (reached only when m >= 16), remaining rows one at a time
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x6_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x6_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x6_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x  = m & (~15);                                            // m rounded down to a multiple of 16
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 2);             // 0x3f: six 16-bit lanes
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x);                       // x0|x1|x2|x3|x4|x5|0|0|
+
+    if (tag_m_16x > 0) {
+        __m512  result_0;
+        // NOTE(review): ALPHAVECTOR/BETAVECTOR are presumably consumed by the STORE* macros defined outside this chunk -- confirm
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+        // Each row is 3 dwords; *_1st indexes rows 0..10 across the first two loads, *_2nd indexes rows 11..15 inside the third load
+        __m512i M512_EPI32_1 = _mm512_set1_epi32(1);
+        __m512i load_idx01_1st = _mm512_set_epi32( 0,  0,  0,  0,  0, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
+        __m512i load_idx01_2nd = _mm512_set_epi32(13, 10,  7,  4,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0);
+
+        __m512i load_idx23_1st = _mm512_add_epi32(load_idx01_1st, M512_EPI32_1);
+        __m512i load_idx23_2nd = _mm512_add_epi32(load_idx01_2nd, M512_EPI32_1);
+
+        __m512i load_idx45_1st = _mm512_add_epi32(load_idx23_1st, M512_EPI32_1);
+        __m512i load_idx45_2nd = _mm512_add_epi32(load_idx23_2nd, M512_EPI32_1);
+
+        unsigned short blend_mask_value = ((unsigned short)0x0400);
+        __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+        // Lane 10 would need index 32, which a two-source 512-bit epi32 permute cannot address; use 0 as a placeholder (stage 2 overwrites this lane)
+        load_idx45_1st = _mm512_mask_blend_epi32(blend_mask, load_idx45_1st, load_idx01_2nd);
+        // Lane 10 of the stage-2 index is 0: the 4|5 pair of row 10 is dword 0 of the third load
+        load_idx45_2nd = _mm512_mask_blend_epi32(blend_mask, load_idx45_2nd, load_idx01_2nd);
+
+        __m512i xArray_01 = _mm512_broadcastd_epi32(x128);                          // x0|x1|x0|x1|...|x0|x1|
+        __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1));  // x2|x3|x2|x3|...|x2|x3|
+        __m512i xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2));  // x4|x5|x4|x5|...|x4|x5|
+
+        unsigned short permute_mask01_uint = (((unsigned short)0xf800));            // lanes 11..15 (rows 11..15)
+        __mmask16 permute_mask01 = *((__mmask16*) &permute_mask01_uint);
+        unsigned short permute_mask45_uint = (((unsigned short)0xfc00));            // lanes 10..15 (rows 10..15)
+        __mmask16 permute_mask45 = *((__mmask16*) &permute_mask45_uint);
+
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2;
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*6]);           // Load 5 rows with n=6 plus 2 element
+            matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+5)*6 + 2)]);   // Load 5 rows with n=6 plus 2 element
+            matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+10)*6 + 4)]);  // Load 5 rows with n=6 plus 2 element
+
+            // Stage 1: interleave for the a..k elements
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1);
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1);
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1);
+
+            // Stage 2: interleave for the l..p elements and remix together
+            matrixArray_stage_0 = _mm512_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2);
+            matrixArray_stage_1 = _mm512_mask_permutexvar_epi32(matrixArray_stage_1, permute_mask01, load_idx23_2nd, matrixArray_2);
+            matrixArray_stage_2 = _mm512_mask_permutexvar_epi32(matrixArray_stage_2, permute_mask45, load_idx45_2nd, matrixArray_2);
+
+            // Accumulate the products of the 0|1, 2|3 and 4|5 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_01);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_45);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {
+            __m256i M256_EPI32_1 = _mm512_castsi512_si256(M512_EPI32_1);
+            __m256i load_idx01_1st = _mm256_set_epi32( 0,  0, 15, 12,  9,  6,  3,  0);   // rows 0..5 across the first two 256-bit loads
+            __m256i load_idx01_2nd = _mm256_set_epi32( 5,  2,  0,  0,  0,  0,  0,  0);   // rows 6..7 inside the third 256-bit load
+
+            __m256i load_idx23_1st = _mm256_add_epi32(load_idx01_1st, M256_EPI32_1);
+            __m256i load_idx23_2nd = _mm256_add_epi32(load_idx01_2nd, M256_EPI32_1);
+            unsigned char blend_mask_value = ((unsigned char)0x20);
+            __mmask8 blend_mask = *((__mmask8*) &blend_mask_value);
+            // Lane 5 would need index 16, which a two-source 256-bit epi32 permute cannot address; use 0 as a placeholder (overwritten by the masked permute below)
+            load_idx23_1st = _mm256_mask_blend_epi32(blend_mask, load_idx23_1st, load_idx01_2nd);
+            // Lane 5 of the second index is 0: the 2|3 pair of row 5 is dword 0 of the third load
+            load_idx23_2nd = _mm256_mask_blend_epi32(blend_mask, load_idx23_2nd, load_idx01_2nd);
+
+            __m256i load_idx45_1st = _mm256_add_epi32(load_idx23_1st, M256_EPI32_1);
+            __m256i load_idx45_2nd = _mm256_add_epi32(load_idx23_2nd, M256_EPI32_1);
+
+            unsigned char permute_mask01_uint = (((unsigned char)0xc0));                 // lanes 6..7 (rows 6..7)
+            __mmask8 permute_mask01 = *((__mmask8*) &permute_mask01_uint);
+            unsigned char permute_mask45_uint = (((unsigned char)0xe0));                 // lanes 5..7 (rows 5..7); used for both the 2|3 and 4|5 passes
+            __mmask8 permute_mask45 = *((__mmask8*) &permute_mask45_uint);
+
+            __m256i matrixArray_0, matrixArray_1, matrixArray_2;
+            __m256i matrixArray_stage_0;
+            __m256  result256_0;
+
+            result256_0 = _mm256_setzero_ps();
+            // _mm256_loadu_si256 is declared with a vector-pointer parameter, so the bfloat16 addresses are cast explicitly
+            matrixArray_0 = _mm256_loadu_si256((const __m256i *) &a[(tag_m_16x)*6]);          // Load 2 rows with n=6 plus 4 element (dwords 0..7)
+            matrixArray_1 = _mm256_loadu_si256((const __m256i *) &a[((tag_m_16x+2)*6 + 4)]);  // Next 16 elements (dwords 8..15)
+            matrixArray_2 = _mm256_loadu_si256((const __m256i *) &a[((tag_m_16x+5)*6 + 2)]);  // Last 16 elements (dwords 16..23)
+
+            // Process the 0|1 elements
+            // Select the 0|1 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2);
+            // Calculate the result of the 0|1 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_01));
+
+            // Process the 2|3 elements
+            // Select the 2|3 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx23_2nd, matrixArray_2);
+            // Accumulate the result of the 2|3 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_23));
+
+            // Process the 4|5 elements
+            // Select the 4|5 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx45_2nd, matrixArray_2);
+            // Accumulate the result of the 4|5 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_45));
+
+            STORE8_COMPLETE_RESULT(result256_0, y+tag_m_16x)
+            tag_m_16x += 8;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m128i matrixArray128;
+        __m128  result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*6]);       // Load 1 row with n=6
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);                    // bring lanes 2,3 down to lanes 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);                     // bring lane 1 down to lane 0
+            result128 = _mm_add_ps(result128, tmp128);                            // lane 0 now holds the sum of all four lanes
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=7 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x7_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x7_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x7_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x  = m & (~15);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 1);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x);               // |x0|x1|x2|x3|x4|x5|x6|0|
+
+    if (tag_m_16x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_0123, xArray_4567;
+        __m512  result_0, result_1, result_2, result_3;
+
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_2 = _mm512_set1_epi32(2);
+        __m512i load_idx_stage1_0 = _mm512_set_epi16(31, 27, 26, 25, 24, 23, 22, 21, 31, 20, 19, 18, 17, 16, 15, 14,
+                                                     31, 13, 12, 11, 10,  9,  8,  7, 31,  6,  5,  4,  3,  2,  1,  0);
+        __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13,  9,  5,  1, 28, 24, 20, 16, 12,  8,  4,  0);
+        __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2);
+
+        unsigned short x_blend_mask_value = ((unsigned short)0xff00);
+        __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value);
+        xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \
+                                                            _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)));
+        xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \
+                                                            _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3)));
+
+        unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4);
+        __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*7]);      // Load 4 rows with n=7
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+4)*7]);    // Load 4 rows with n=7
+            matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+8)*7]);    // Load 4 rows with n=7
+            matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+12)*7]);   // Load 4 rows with n=7
+
+            // Stage 1: padding
+            matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0);                        // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1);                        // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+            matrixArray_2 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_2);                        // |i0|i1|i2|i3|...|j6|j7|k0|k1|k2|k3|...|l6|l7|
+            matrixArray_3 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_3);                        // |m0|m1|m2|m3|...|n6|n7|o0|o1|o2|o3|...|p6|p7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);  // |a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);  // |a4|a5|...|h4|h5|a6|a7|...|h6|h7|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3);  // |i0|i1|...|p0|p1|i2|i3|...|p2|p3|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3);  // |i4|i5|...|p4|p5|i6|i7|...|p6|p7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567);
+
+            // Stage 3: interleave per 256 bits
+            result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44);
+            result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee);
+
+            result_2 = _mm512_add_ps(result_2, result_3);
+
+            STORE16_COMPLETE_RESULT(result_2, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]);      // Load 4 rows with n=7
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x+4)*7]);    // Load 4 rows with n=7
+
+            // Stage 1: padding
+            matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0);                        // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1);                        // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);  // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);  // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+
+            tag_m_16x += 8;
+        }
+
+        BLASLONG tail_num = m - tag_m_16x;
+        if (tail_num > 3) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]);      // Load 4 rows with n=7
+            unsigned int tail_load_mask_value = (((unsigned int)0xffffffff) >> (4+(8-tail_num)*7));
+            __mmask32 tail_load_mask = *((__mmask32*) &tail_load_mask_value);
+            matrixArray_1 = _mm512_maskz_loadu_epi16(tail_load_mask, &a[(tag_m_16x+4)*7]);    // Load 4 rows with n=7
+
+            // Stage 1: padding
+            matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0);                        // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1);                        // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);  // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);  // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));
+            __mmask8 tail_mask = *((__mmask8*) &tail_mask_value);
+            STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask)
+            tag_m_16x = m;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m128i matrixArray128;
+        __m128  result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*7]);       // Load 1 rows with n=7
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=8 with rows of A stored back to back (row i starts at a[i*8], no lda argument): 16 rows per main-loop iteration, then an 8-row step, a 4..7-row masked step and a single-row tail
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x8_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)   // scalar tail: y[i] = alpha*dot + beta*y[i]
+#else
+static int sbgemv_kernel_16x8_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)    // scalar tail: y[i] = alpha*dot + y[i]
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x8_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)                    // scalar tail: y[i] = alpha*dot
+#else
+static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)                          // scalar tail: y[i] = dot
+#endif
+#endif
+{
+    BLASLONG tag_m_16x  = m & (~15);                 // m rounded down to a multiple of 16; later advanced as tail rows get handled
+
+    __m128i x128 = _mm_loadu_si128((const __m128i *) x);   // |x0|x1|x2|x3|x4|x5|x6|x7|  (explicit cast: the intrinsic takes an __m128i pointer, x is bfloat16*)
+
+    if (tag_m_16x > 0) {                             // all vector paths (including the 8-row and 4..7-row tails) run only when m >= 16
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_0123, xArray_4567;
+        __m512  result_0, result_1, result_2, result_3;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);  // not referenced by name below; presumably consumed by the STORE* macros (defined outside this block)
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);   // not referenced by name below; presumably consumed by the STORE* macros (defined outside this block)
+#endif
+
+        __m512i M512_EPI32_2 = _mm512_set1_epi32(2);
+        __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13,  9,  5,  1, 28, 24, 20, 16, 12,  8,  4,  0);  // dword 0 of 8 rows (4 dwords per row), then dword 1 of the same 8 rows
+        __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2);                                  // same pattern for dwords 2 and 3 of each row
+
+        unsigned short x_blend_mask_value = ((unsigned short)0xff00);   // selects the upper 8 dword lanes
+        __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value);
+        xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \
+                                                            _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)));
+        xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \
+                                                            _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3)));
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*8]);     // Load 4 rows with n=8
+            matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+4)*8]);   // Load 4 rows with n=8
+            matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+8)*8]);   // Load 4 rows with n=8
+            matrixArray_3 = _mm512_loadu_si512(&a[(idx_m+12)*8]);  // Load 4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);  // |a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);  // |a4|a5|...|h4|h5|a6|a7|...|h6|h7|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3);  // |i0|i1|...|p0|p1|i2|i3|...|p2|p3|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3);  // |i4|i5|...|p4|p5|i6|i7|...|p6|p7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567);
+
+            // Stage 2: interleave per 256 bits
+            result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44);   // low 256 bits of result_0 | low 256 bits of result_1
+            result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee);   // high 256 bits of result_0 | high 256 bits of result_1
+
+            result_2 = _mm512_add_ps(result_2, result_3);                // one complete dot product per row a..p
+
+            STORE16_COMPLETE_RESULT(result_2, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {                                         // at least 8 rows left: handle 8 of them with full loads
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]);      // Load 4 rows with n=8
+            matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+4)*8]);    // Load 4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);  // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);  // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));  // fold the two 256-bit halves: one sum per row
+
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;
+        }
+
+        BLASLONG tail_num = m - tag_m_16x;
+        if (tail_num > 3) {                                              // 4..7 rows left: first 4 rows loaded in full, the rest through a mask
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]);      // Load 4 rows with n=8
+            unsigned short tail_load_mask_value = (((unsigned int)0xffff) >> ((8-tail_num)*4));   // keeps 4 dwords (one row) per remaining row beyond the first 4
+            __mmask16 tail_load_mask = *((__mmask16*) &tail_load_mask_value);
+            matrixArray_1 = _mm512_maskz_loadu_epi32(tail_load_mask, &a[(tag_m_16x+4)*8]);    // Load the remaining (tail_num-4) rows with n=8, other lanes zeroed
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);  // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);  // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));  // fold the two 256-bit halves: one sum per row
+
+            unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));   // low tail_num bits set
+            __mmask8 tail_mask = *((__mmask8*) &tail_mask_value);
+            STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask)
+            tag_m_16x = m;                                               // nothing is left for the scalar row loop
+        }
+    }
+
+    if (tag_m_16x != m) {                                                // leftover rows (at most 3 after the vector paths, or all rows when m < 16)
+        __m128i matrixArray128;
+        __m128  result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_loadu_si128((const __m128i *) &a[(i)*8]);       // Load 1 rows with n=8
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);           // bring lanes 2,3 down to lanes 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);            // bring lane 1 down to lane 0
+            result128 = _mm_add_ps(result128, tmp128);                   // lane 0 now holds the full 8-element dot product
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=9 with rows of A stored back to back (row i starts at a[i*9], no lda argument): 14 rows per main-loop iteration, remaining rows handled one at a time
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_14x9_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)   // scalar tail: y[i] = alpha*dot + beta*y[i]
+#else
+static int sbgemv_kernel_14x9_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)    // scalar tail: y[i] = alpha*dot + y[i]
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_14x9_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)                    // scalar tail: y[i] = alpha*dot
+#else
+static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)                          // scalar tail: y[i] = dot
+#endif
+#endif
+{
+    BLASLONG tag_m_14x = m - (m%14);                             // m rounded down to a multiple of 14
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7);   // 0x01: only the first 16-bit element is loaded
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128_0 = _mm_loadu_si128((const __m128i *) x);       // |x0|x1|x2|x3|x4|x5|x6|x7|  (explicit cast: the intrinsic takes an __m128i pointer, x is bfloat16*)
+    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8));  // |x8|0 |0 | 0| 0| 0| 0| 0|
+
+    if (tag_m_14x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89;
+        __m512  result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);  // not referenced by name below; presumably consumed by the STORE* macro (defined outside this block)
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);   // not referenced by name below; presumably consumed by the STORE* macro (defined outside this block)
+#endif
+
+        __m256i M256_EPI16_2 = _mm256_set1_epi16(2);
+        __m256i idx_base_0 = _mm256_set_epi16( 0,  0, 55, 54, 46, 45, 37, 36, 28, 27, 19, 18, 10,  9,  1,  0);   // 16-bit indices of elements 0,1 of 7 consecutive rows (row stride 9); top 2 entries unused
+        __m256i idx_base_1 = _mm256_add_epi16(idx_base_0, M256_EPI16_2);                                          // elements 2,3 of the same rows
+        __m256i idx_base_2 = _mm256_add_epi16(idx_base_1, M256_EPI16_2);                                          // elements 4,5
+        __m256i idx_base_3 = _mm256_add_epi16(idx_base_2, M256_EPI16_2);                                          // elements 6,7
+        __m256i idx_base_4 = _mm256_add_epi16(idx_base_3, M256_EPI16_2);                                          // element 8 plus the element that follows it in memory
+        __m512i idx_idx    = _mm512_set_epi32( 0,  0, 22, 21, 20, 19, 18, 17, 16,  6,  5,  4,  3,  2,  1,  0);   // concatenates 7 dwords of the first operand with 7 dwords of the second
+
+        __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1));
+        __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3));
+        __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0));
+        __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2));
+        __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4));
+        __m512i load_idx_stage2_0 = _mm512_set_epi32( 0,  0, 22, 21, 20, 19, 18, 17, 16, 13, 12, 11, 10,  9,  8,  7);   // dwords 7..13 of the first operand, then dwords 0..6 of the second
+
+        xArray_01 = _mm512_broadcastd_epi32(x128_0);                          // |x0|x1|x0|x1| ... |x0|x1|
+        xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1));  // |x2|x3|x2|x3| ... |x2|x3|
+        xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2));  // |x4|x5|x4|x5| ... |x4|x5|
+        xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3));  // |x6|x7|x6|x7| ... |x6|x7|
+        xArray_89 = _mm512_broadcastd_epi32(x128_1);                          // |x8|0 |x8| 0| ... |x8| 0|
+
+        unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 1);     // 31 of 32 16-bit elements: 32 + 31 = 63 = 7 rows * 9
+        __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+        unsigned short blend_mask_value = ((unsigned short)0x3f80);           // dword lanes 7..13 are taken from the second blend operand
+        __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+        unsigned short store_mask_value = (((unsigned short)0xffff) >> 2);    // 14 low bits set, one per processed row
+        __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_14x; idx_m+=14) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*9]);                          // Load 3 rows with n=9 plus 5 elements
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+3)*9 + 5]);   // Load 3 rows with n=9 plus 4 elements
+            matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+7)*9]);                        // Load 3 rows with n=9 plus 5 elements
+            matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*9 + 5]);  // Load 3 rows with n=9 plus 4 elements
+
+            // Stage 1: interleave per 16 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_0, matrixArray_1);  // |a0|a1|...|g0|g1|a2|a3|...|g2|g3|x|x|x|x|
+            matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_1, matrixArray_1);  // |a4|a5|...|g4|g5|a6|a7|...|g6|g7|x|x|x|x|
+            matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_2, matrixArray_3);  // |h2|h3|...|n2|n3|h0|h1|...|n0|n1|x|x|x|x|
+            matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_3, matrixArray_3);  // |h6|h7|...|n6|n7|h4|h5|...|n4|n5|x|x|x|x|
+            matrixArray_4       = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_4, matrixArray_1);  // |a8| x|...|g8| x| x| x|...| x| x|x|x|x|x|
+            matrixArray_5       = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_4, matrixArray_3);  // | x| x|...| x| x|h8| x|...|n8| x|x|x|x|x|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2);           // |a0|a1|b0|b1|...|h0|h1|i0|i1|j0|j1|...|n0|n1|x|x|x|x|
+            matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2);  // |a2|a3|b2|b3|...|h2|h3|i2|i3|j2|j3|...|n2|n3|x|x|x|x|
+            matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3);           // |a4|a5|b4|b5|...|h4|h5|i4|i5|j4|j5|...|n4|n5|x|x|x|x|
+            matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3);  // |a6|a7|b6|b7|...|h6|h7|i6|i7|j6|j7|...|n6|n7|x|x|x|x|
+            matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_4, matrixArray_5);                       // |a8| x|b8| x|...|h8| x|i8| x|j8| x|...|n8| x|x|x|x|x|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);   // the "x" halves meet the 0 in xArray_89; NOTE(review): they hold neighbouring matrix elements, so an Inf/NaN there would presumably turn into NaN -- confirm inputs are finite
+            result_0 = _mm512_add_ps(result_0, result_1);                                            // lanes 0..13 hold the dot products of rows a..n
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask)
+        }
+    }
+
+    if (tag_m_14x != m) {                                        // up to 13 leftover rows, one row per iteration
+        __m256i matrixArray256;
+        __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1);   // |x0|...|x7|x8|0|...|0|
+        __m256  result256;
+        __m128  result128, tmp128;
+        unsigned short load256_mask_value = (((unsigned short)0xffff) >> 7);   // 9 low bits set: one row of n=9
+        __mmask16 load256_mask = *((__mmask16*) &load256_mask_value);
+        for (BLASLONG i = tag_m_14x; i < m; i++) {
+            result256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*9]);   // Load 1 row with n=9, remaining lanes zeroed
+            result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256);
+            result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1));   // fold the upper 128 bits onto the lower
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);   // bring lanes 2,3 down to lanes 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);    // bring lane 1 down to lane 0
+            result128 = _mm_add_ps(result128, tmp128);           // lane 0 now holds the full dot product
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=10 with rows of A stored back to back (row i starts at a[i*10], no lda argument): 12 rows per main-loop iteration, remaining rows handled one at a time
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_12x10_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)   // scalar tail: y[i] = alpha*dot + beta*y[i]
+#else
+static int sbgemv_kernel_12x10_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)    // scalar tail: y[i] = alpha*dot + y[i]
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_12x10_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)                    // scalar tail: y[i] = alpha*dot
+#else
+static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)                          // scalar tail: y[i] = dot
+#endif
+#endif
+{
+    BLASLONG tag_m_12x  = m - (m%12);                                     // m rounded down to a multiple of 12
+
+    unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3);        // 0x1: only the first 32-bit element (x8,x9) is loaded
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128_0 = _mm_loadu_si128((const __m128i *) x);                // |x0|x1|x2|x3|x4|x5|x6|x7|  (explicit cast: the intrinsic takes an __m128i pointer, x is bfloat16*)
+    __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8));           // |x8|x9|0 | 0| 0| 0| 0| 0|
+
+    if (tag_m_12x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5;
+        __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89;
+        __m512  result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);  // not referenced by name below; presumably consumed by the STORE* macro (defined outside this block)
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);   // not referenced by name below; presumably consumed by the STORE* macro (defined outside this block)
+#endif
+
+        __m256i M256_EPI32_1 = _mm256_set1_epi32(1);
+        __m256i idx_base_0 = _mm256_set_epi32( 0,  0, 26, 21, 16, 10,  5,  0);   // dword 0 of 6 rows: 3 rows in the first operand (5 dwords each), 3 in the second (offset 16); top 2 entries unused
+        __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_1);          // dword 1 (elements 2,3) of the same rows
+        __m256i idx_base_2 = _mm256_add_epi32(idx_base_1, M256_EPI32_1);          // dword 2 (elements 4,5)
+        __m256i idx_base_3 = _mm256_add_epi32(idx_base_2, M256_EPI32_1);          // dword 3 (elements 6,7)
+        __m256i idx_base_4 = _mm256_add_epi32(idx_base_3, M256_EPI32_1);          // dword 4 (elements 8,9)
+        __m512i idx_idx    = _mm512_set_epi32( 0,  0,  0,  0, 21, 20, 19, 18, 17, 16,  5,  4,  3,  2,  1,  0);   // concatenates 6 dwords of the first operand with 6 dwords of the second
+
+        __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1));
+        __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3));
+        __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0));
+        __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2));
+        __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4));
+        __m512i load_idx_stage2_0 = _mm512_set_epi32( 0,  0,  0,  0, 21, 20, 19, 18, 17, 16, 11, 10,  9,  8,  7,  6);   // dwords 6..11 of the first operand, then dwords 0..5 of the second
+
+        xArray_01 = _mm512_broadcastd_epi32(x128_0);                          // |x0|x1|x0|x1| ... |x0|x1|
+        xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1));  // |x2|x3|x2|x3| ... |x2|x3|
+        xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2));  // |x4|x5|x4|x5| ... |x4|x5|
+        xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3));  // |x6|x7|x6|x7| ... |x6|x7|
+        xArray_89 = _mm512_broadcastd_epi32(x128_1);                          // |x8|x9|x8|x9| ... |x8|x9|
+
+        unsigned short blend_mask_value = ((unsigned short)0x0fc0);           // dword lanes 6..11 are taken from the second blend operand
+        __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+        unsigned short load_mask_value = (((unsigned short)0xffff) >> 1);     // 15 dwords = 30 elements = 3 rows of n=10
+        __mmask16 load_mask = *((__mmask16*) &load_mask_value);
+        unsigned short store_mask_value = (((unsigned short)0xffff) >> 4);    // 12 low bits set, one per processed row
+        __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_12x; idx_m+=12) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m)*10]);     // Load 3 rows with n=10
+            matrixArray_1 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+3)*10]);   // Load 3 rows with n=10
+            matrixArray_2 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+6)*10]);   // Load 3 rows with n=10
+            matrixArray_3 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+9)*10]);   // Load 3 rows with n=10
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_0, matrixArray_1);  // |a0|a1|...|f0|f1|a2|a3|...|f2|f3|x|x|x|x|x|x|x|x|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_1, matrixArray_1);  // |a4|a5|...|f4|f5|a6|a7|...|f6|f7|x|x|x|x|x|x|x|x|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_2, matrixArray_3);  // |g2|g3|...|l2|l3|g0|g1|...|l0|l1|x|x|x|x|x|x|x|x|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_3, matrixArray_3);  // |g6|g7|...|l6|l7|g4|g5|...|l4|l5|x|x|x|x|x|x|x|x|
+            matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_4, matrixArray_1);  // |a8|a9|...|f8|f9| x| x|...| x| x|x|x|x|x|x|x|x|x|
+            matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_4, matrixArray_3);  // | x| x|...| x| x|g8|g9|...|l8|l9|x|x|x|x|x|x|x|x|
+
+            // Stage 2: merge the two 6-row halves per 32 bits (blend / permute) so each vector holds one element pair for rows a..l
+            matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2);           // |a0|a1|...|l0|l1|x|x|x|x|x|x|x|x|
+            matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2);  // |a2|a3|...|l2|l3|x|x|x|x|x|x|x|x|
+            matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3);           // |a4|a5|...|l4|l5|x|x|x|x|x|x|x|x|
+            matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3);  // |a6|a7|...|l6|l7|x|x|x|x|x|x|x|x|
+            matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_4, matrixArray_stage_5);           // |a8|a9|...|l8|l9|x|x|x|x|x|x|x|x|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+            result_0 = _mm512_add_ps(result_0, result_1);                                            // lanes 0..11 hold the dot products of rows a..l
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask)
+        }
+    }
+
+    if (tag_m_12x != m) {                                                 // up to 11 leftover rows, one row per iteration
+        __m256i matrixArray256;
+        __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1);   // |x0|...|x7|x8|x9|0|...|0|
+        __m256  result256;
+        __m128  result128, tmp128;
+        unsigned char load256_mask_value = (((unsigned char)0xff) >> 3);  // 5 low bits set: 5 dwords = one row of n=10
+        __mmask8 load256_mask = *((__mmask8*) &load256_mask_value);
+        for (BLASLONG i = tag_m_12x; i < m; i++) {
+            result256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi32(load256_mask, &a[(i)*10]);   // Load 1 row with n=10, remaining lanes zeroed
+            result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256);
+            result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1));   // fold the upper 128 bits onto the lower
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);            // bring lanes 2,3 down to lanes 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);             // bring lane 1 down to lane 0
+            result128 = _mm_add_ps(result128, tmp128);                    // lane 0 now holds the full dot product
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 15 rows parallel processing BF16 GEMV kernel for n=11 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_15x11_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_15x11_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_15x11_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_15x = m - (m%15);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128_0 = _mm_loadu_si128(x);                         // |x0|x1| x2|x3|x4|x5|x6|x7|
+    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8));  // |x8|x9|x10| 0| 0| 0| 0| 0|
+
+    if (tag_m_15x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5;
+        __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10;
+        __m512  result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+        __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;
+        __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3;
+
+        __m512i M512_EPI16_2, M512_EPI16_4, M512_EPI16_6, M512_EPI32_5;
+        M512_EPI16_2 = _mm512_set1_epi16(2);
+        M512_EPI16_4 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2);
+        M512_EPI16_6 = _mm512_add_epi16(M512_EPI16_4, M512_EPI16_2);
+        M512_EPI32_5 = _mm512_set1_epi32(5);
+
+        unsigned int BASE_MASK_10_value = ((unsigned int)0x000003ff);
+        __mmask32 BASE_MASK_10 = *((__mmask32*) &BASE_MASK_10_value);
+        unsigned int BASE_MASK_20_value = ((unsigned int)0x000ffc00);
+        __mmask32 BASE_MASK_20 = *((__mmask32*) &BASE_MASK_20_value);
+        unsigned int BASE_MASK_30_value = ((unsigned int)0x3ff00000);
+        __mmask32 BASE_MASK_30 = *((__mmask32*) &BASE_MASK_30_value);
+
+        idx_stage1_base_0 = _mm512_set_epi16( 0,  0, 49, 48, 38, 37, 27, 26, 16, 15,  5,  4, 47, 46, 36, 35,
+                                             25, 24, 14, 13,  3,  2, 45, 44, 34, 33, 23, 22, 12, 11,  1,  0);
+        idx_stage1_base_1 = _mm512_add_epi16(idx_stage1_base_0, M512_EPI16_6);
+
+        idx_stage1_base_2 = _mm512_mask_add_epi16(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI16_2);
+        idx_stage1_base_2 = _mm512_mask_sub_epi16(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI16_2);
+        idx_stage1_base_3 = _mm512_add_epi16(idx_stage1_base_2, M512_EPI16_6);
+
+        idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI16_2);
+        idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI16_2);
+        idx_stage1_base_4 = _mm512_mask_sub_epi16(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI16_4);
+        idx_stage1_base_5 = _mm512_add_epi16(idx_stage1_base_4, M512_EPI16_6);
+
+        unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0);
+        __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value);
+        unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00);
+        __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value);
+        idx_stage2_base_0 = _mm512_set_epi32( 0,  0,  0,  0,  0,  0, 20, 19, 18, 17, 16,  9,  8,  7,  6,  5);
+        idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0);
+        idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5);
+        idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5);
+        idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5);
+
+        xArray_01 = _mm512_broadcastd_epi32(x128_0);                          // |x0 |x1 |x0 |x1 | ... |x0 |x1 |
+        xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1));  // |x2 |x3 |x2 |x3 | ... |x2 |x3 |
+        xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2));  // |x4 |x5 |x4 |x5 | ... |x4 |x5 |
+        xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3));  // |x6 |x7 |x6 |x7 | ... |x6 |x7 |
+        xArray_89 = _mm512_broadcastd_epi32(x128_1);                          // |x8 |x9 |x8 |x9 | ... |x8 |x9 |
+        xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1));  // |x10|0  |x10|0  | ... |x10|0  |
+
+        unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 9);
+        __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+        unsigned short store_mask_value = (((unsigned short)0xffff) >> 1);
+        __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[idx_m*11]);                             // Load 2 rows with n=11 plus 10 elements
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*11 + 32]);       // Load 2 rows with n=11 plus 1 element
+            matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*11]);                         // Load 2 rows with n=11 plus 10 elements
+            matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*11 + 32]);   // Load 2 rows with n=11 plus 1 element
+            matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*11]);                        // Load 2 rows with n=11 plus 10 elements
+            matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*11 + 32]);  // Load 2 rows with n=11 plus 1 element
+
+            // Stage 1: interleave per 16 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_0, matrixArray_1);  // |a0|a1|...|e0|e1|a2|a3|...|e2|e3|a4 |a5|...|e4 |e5|
+            matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_1, matrixArray_1);  // |a6|a7|...|e6|e7|a8|a9|...|e8|e9|a10|x |...|e10|x |
+            matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_2, matrixArray_3);  // |f2|f3|...|j2|j3|f0|f1|...|j0|j1|f4 |f5|...|j4 |j5|
+            matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_3, matrixArray_3);  // |f8|f9|...|j8|j9|f6|f7|...|j6|j7|f10|x |...|j10|x |
+            matrixArray_stage_4 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_4, matrixArray_5);  // |k4|k5|...|o4|o5|k2|k3|...|o2|o3|k0 |k1|...|o0 |o1|
+            matrixArray_stage_5 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_5, matrixArray_5);  // |k10|x|...|o10|x|k8|k9|...|o8|o9|k6 |k7|...|o6 |o7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2);    // |a0|a1|...|j0|j1|x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3);    // |a6|a7|...|j6|j7|x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2);  // |a2|a3|...|j2|j3|x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2);  // |a4|a5|...|j4|j5|x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3);  // |a8|a9|...|j8|j9|x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3);  // |a10|x|...|j10|x|x|x|x|x|x|x|x|x|x|x|x|x|
+
+            matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0,       matrixArray_stage_4);    // |a0|a1|.......................|o0|o1|x|x|
+            matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3,       matrixArray_stage_5);    // |a6|a7|.......................|o6|o7|x|x|
+            matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1      , idx_stage2_base_1, matrixArray_stage_4);  // |a2|a3|.......................|o2|o3|x|x|
+            matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2      , idx_stage2_base_3, matrixArray_stage_4);  // |a4|a5|.......................|o4|o5|x|x|
+            matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4      , idx_stage2_base_1, matrixArray_stage_5);  // |a8|a9|.......................|o8|o9|x|x|
+            matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5      , idx_stage2_base_3, matrixArray_stage_5);  // |a10|x|.......................|o10|x|x|x|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10);
+            result_0 = _mm512_add_ps(result_0, result_1);
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask)
+        }
+    }
+
+    if (tag_m_15x != m) {
+        __m256i matrixArray256;
+        __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1);
+        __m256  result256;
+        __m128  result128, tmp128;
+        unsigned short load256_mask_value = (((unsigned short)0xffff) >> 5);
+        __mmask16 load256_mask = *((__mmask16*) &load256_mask_value);
+        for (BLASLONG i = tag_m_15x; i < m; i++) {
+            result256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*11]);
+            result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256);
+            result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1));
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=12 with densely packed rows (row i starts at a[i*12], no lda argument): 15 rows per main-loop pass, then the leftover m%15 rows one at a time
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_15x12_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_15x12_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_15x12_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_15x = m - (m%15);  // m rounded down to a multiple of 15: rows handled by the 15-row loop
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4);  // 0x0f: keep the low 4 of 8 16-bit lanes
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128_0 = _mm_loadu_si128(x);                         // |x0|x1| x2| x3|x4|x5|x6|x7|  NOTE(review): x (bfloat16*) is passed without a cast to the intrinsic's __m128i pointer type
+    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8));  // |x8|x9|x10|x11| 0| 0| 0| 0|
+
+    if (tag_m_15x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5;
+        __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10;
+        __m512  result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);  // not named again in this function; presumably consumed by the STORE macro - confirm
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);   // not named again in this function; presumably consumed by the STORE macro - confirm
+#endif
+
+        __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;
+        __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3;
+
+        __m512i M512_EPI32_1, M512_EPI32_2, M512_EPI32_3, M512_EPI32_5;
+        M512_EPI32_1 = _mm512_set1_epi32(1);
+        M512_EPI32_2 = _mm512_add_epi32(M512_EPI32_1, M512_EPI32_1);  // all lanes = 2
+        M512_EPI32_3 = _mm512_add_epi32(M512_EPI32_2, M512_EPI32_1);  // all lanes = 3
+        M512_EPI32_5 = _mm512_add_epi32(M512_EPI32_3, M512_EPI32_2);  // all lanes = 5
+
+        unsigned short BASE_MASK_10_value = ((unsigned short)0x001f);  // 32-bit lanes 0-4
+        __mmask16 BASE_MASK_10 = *((__mmask16*) &BASE_MASK_10_value);
+        unsigned short BASE_MASK_20_value = ((unsigned short)0x03e0);  // 32-bit lanes 5-9
+        __mmask16 BASE_MASK_20 = *((__mmask16*) &BASE_MASK_20_value);
+        unsigned short BASE_MASK_30_value = ((unsigned short)0xfc00);  // 32-bit lanes 10-15
+        __mmask16 BASE_MASK_30 = *((__mmask16*) &BASE_MASK_30_value);
+
+        idx_stage1_base_0 = _mm512_set_epi32( 0, 26, 20, 14,  8,  2, 25, 19, 13,  7,  1,  24, 18, 12,  6,  0);  // dword picks, lane 0 first: 0,6,12,18,24 | 1,7,13,19,25 | 2,8,14,20,26 | 0 (stride 6 dwords = one 12-element row)
+        idx_stage1_base_1 = _mm512_add_epi32(idx_stage1_base_0, M512_EPI32_3);  // same pattern moved 3 dwords on: element pairs 6..11 of each row
+
+        idx_stage1_base_2 = _mm512_mask_add_epi32(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI32_1);  // lanes 0-4: base_0 + 1, other lanes copied from base_0
+        idx_stage1_base_2 = _mm512_mask_sub_epi32(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI32_1);  // lanes 5-9: base_0 - 1
+        idx_stage1_base_3 = _mm512_add_epi32(idx_stage1_base_2, M512_EPI32_3);  // base_2 moved 3 dwords on
+
+        idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI32_1);  // lanes 0-4: base_2 + 1, other lanes copied from base_2
+        idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI32_1);  // lanes 5-9: base_2 + 1
+        idx_stage1_base_4 = _mm512_mask_sub_epi32(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI32_2);  // lanes 10-15: base_2 - 2
+        idx_stage1_base_5 = _mm512_add_epi32(idx_stage1_base_4, M512_EPI32_3);  // base_4 moved 3 dwords on
+
+        unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0);  // 32-bit lanes 5-9
+        __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value);
+        unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00);  // 32-bit lanes 10-14
+        __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value);
+        idx_stage2_base_0 = _mm512_set_epi32( 0,  0,  0,  0,  0,  0, 20, 19, 18, 17, 16,  9,  8,  7,  6,  5);  // lanes 0-4 <- 1st operand lanes 5-9, lanes 5-9 <- 2nd operand lanes 0-4
+        idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0);  // lanes 0-9 kept from 1st operand, lanes 10-14 <- 2nd operand lanes 5-9
+        idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5);  // base_0 + 5 in every lane
+        idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5);  // lanes 5-9 get a further + 5: lanes 0-4 <- 1st operand lanes 10-14, lanes 5-9 <- 2nd operand lanes 10-14
+        idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5);  // base_1 with lanes 10-14 lowered by 5: they read 2nd operand lanes 0-4
+
+        xArray_01 = _mm512_broadcastd_epi32(x128_0);                          // |x0 |x1 |x0 |x1 | ... |x0 |x1 |
+        xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1));  // |x2 |x3 |x2 |x3 | ... |x2 |x3 |
+        xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2));  // |x4 |x5 |x4 |x5 | ... |x4 |x5 |
+        xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3));  // |x6 |x7 |x6 |x7 | ... |x6 |x7 |
+        xArray_89 = _mm512_broadcastd_epi32(x128_1);                          // |x8 |x9 |x8 |x9 | ... |x8 |x9 |
+        xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1));  // |x10|x11|x10|x11| ... |x10|x11|
+
+        unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4);  // low 28 of 32 16-bit lanes
+        __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+        unsigned short store_mask_value = (((unsigned short)0xffff) >> 1);  // low 15 of 16 float lanes (one per row)
+        __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[idx_m*12]);                             // Load 2 rows with n=12 plus 8 elements
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*12 + 32]);       // Load 2 rows with n=12 plus 4 elements (with the load above: 60 elements = 5 rows)
+            matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*12]);                         // Load 2 rows with n=12 plus 8 elements
+            matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*12 + 32]);   // Load 2 rows with n=12 plus 4 elements
+            matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*12]);                        // Load 2 rows with n=12 plus 8 elements
+            matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*12 + 32]);  // Load 2 rows with n=12 plus 4 elements
+
+            // Stage 1: gather element pairs per 32 bits (epi32 permutes across each 5-row group)
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_0, matrixArray_1);  // |a0 |a1 |...|e0 |e1 |a2|a3|...|e2|e3|a4 |a5 |...|e4 |e5 |
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_1, matrixArray_1);  // |a6 |a7 |...|e6 |e7 |a8|a9|...|e8|e9|a10|a11|...|e10|e11|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_2, matrixArray_3);  // |f2 |f3 |...|j2 |j3 |f0|f1|...|j0|j1|f4 |f5 |...|j4 |j5 |
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_3, matrixArray_3);  // |f8 |f9 |...|j8 |j9 |f6|f7|...|j6|j7|f10|f11|...|j10|j11|
+            matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_4, matrixArray_5);  // |k4 |k5 |...|o4 |o5 |k2|k3|...|o2|o3|k0 |k1 |...|o0 |o1 |
+            matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_5, matrixArray_5);  // |k10|k11|...|o10|o11|k8|k9|...|o8|o9|k6 |k7 |...|o6 |o7 |
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2);    // |a0 |a1 |...|j0 |j1 |x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3);    // |a6 |a7 |...|j6 |j7 |x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2);  // |a2 |a3 |...|j2 |j3 |x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2);  // |a4 |a5 |...|j4 |j5 |x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3);  // |a8 |a9 |...|j8 |j9 |x|x|x|x|x|x|x|x|x|x|x|x|
+            matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3);  // |a10|a11|...|j10|j11|x|x|x|x|x|x|x|x|x|x|x|x|
+
+            matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0,       matrixArray_stage_4);    // |a0|a1|.......................|o0|o1|x|x|
+            matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3,       matrixArray_stage_5);    // |a6|a7|.......................|o6|o7|x|x|
+            matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1      , idx_stage2_base_1, matrixArray_stage_4);  // |a2|a3|.......................|o2|o3|x|x|
+            matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2      , idx_stage2_base_3, matrixArray_stage_4);  // |a4|a5|.......................|o4|o5|x|x|
+            matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4      , idx_stage2_base_1, matrixArray_stage_5);  // |a8|a9|.......................|o8|o9|x|x|
+            matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5      , idx_stage2_base_3, matrixArray_stage_5);  // |a10|a11|.....................|o10|o11|x|x|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);  // each 32-bit lane accumulates one row's pair product with the broadcast x pair
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10);
+            result_0 = _mm512_add_ps(result_0, result_1);  // merge the two accumulator chains
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask)  // macro defined outside this function; presumably applies alpha/beta and writes only the masked lanes to y - confirm
+        }
+    }
+
+    if (tag_m_15x != m) {  // leftover rows (m not a multiple of 15): one row per iteration
+        __m256i matrixArray256;
+        __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1);  // x0..x11 followed by four zero lanes
+        __m256  result256;
+        __m128  result128, tmp128;
+        unsigned short load256_mask_value = (((unsigned short)0xffff) >> 4);  // low 12 of 16 16-bit lanes: exactly one row
+        __mmask16 load256_mask = *((__mmask16*) &load256_mask_value);
+        for (BLASLONG i = tag_m_15x; i < m; i++) {
+            result256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*12]);  // row i, upper 4 lanes zeroed
+            result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256);  // 8 partial sums, one per element pair
+            result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1));  // fold the upper 128 bits onto the lower
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);  // bring elements 2,3 down to positions 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);   // bring element 1 down to position 0
+            result128 = _mm_add_ps(result128, tmp128);          // element 0 now holds the full dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+
+// BF16 GEMV kernel for n=13 with densely packed rows (row i starts at a[i*13], no lda argument): 16 rows per main-loop pass, then optional 8-row and 4-row steps, then single rows
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x13_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x13_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x13_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x  = m & (~15);  // m rounded down to a multiple of 16; advanced further by the 8/4-row steps below
+
+    unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 3);  // 0x1fff: low 13 of 16 16-bit lanes
+    __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value);
+    __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x);    // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|0|0|0|
+
+    if (tag_m_16x > 0) {  // taken only when m >= 16; the 8-row and 4-row steps are nested in this branch, so smaller m goes straight to the per-row loop
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2,  matrixArray_3,  matrixArray_4,  matrixArray_5,  matrixArray_6,  matrixArray_7, \
+                matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+        __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+        __m512  accum512_0, accum512_1;
+        __m512  result_0, result_1;
+
+        __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);  // not named again in this function; presumably consumed by the STORE macros - confirm
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);   // not named again in this function; presumably consumed by the STORE macros - confirm
+#endif
+
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0   = _mm512_set_epi32(27, 26, 25, 24, 11, 10,  9,  8, 19, 18, 17, 16,  3,  2,  1,  0);  // lane 0 first: 1st operand 0-3 | 2nd operand 0-3 | 1st operand 8-11 | 2nd operand 8-11
+        __m512i idx_base_1   = _mm512_add_epi32(idx_base_0, M512_EPI32_4);  // same pattern for lanes 4-7 and 12-15 of each operand
+
+        unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6);  // low 26 of 32 lanes; NOTE(review): load_mask is not referenced by name anywhere else in this function
+        __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+        // Prepare X: copy x256 into both 256-bit halves, then run the 2-step interleave macro (defined elsewhere; presumably fills xArray_1..3 - confirm)
+        xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
+        BF16_INTERLEAVE_1x32(xArray)
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load matrix: masked 256-bit loads starting at row idx_m (macro defined elsewhere; presumably one row per matrixArray256_* - confirm)
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m, 0, x_load_mask)
+
+            matrixArray_8  = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);  // pack two 256-bit loads into one 512-bit register
+            matrixArray_9  = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m+8, 0, x_load_mask)  // same load, starting at row idx_m+8
+
+            matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+            matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            // interleave per 256 bits
+            BF16_INTERLEAVE256_8x32(matrixArray)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_8x32(matrixArray)
+
+            // Calculate the temp result for a..p[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+            // Reorder and add up the final result
+            result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+            result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+            result_0 = _mm512_add_ps(result_0, result_1);
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)  // macro defined outside this function; presumably applies alpha/beta and stores 16 results - confirm
+        }
+
+        if (m - tag_m_16x > 7) {  // at least 8 rows left: one 8-row step
+            __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12,  7,  6,  5,  4, 11, 10,  9,  8,  3,  2,  1,  0);  // swaps lane groups 4-7 and 8-11
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load matrix: masked 256-bit loads starting at row tag_m_16x
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask)
+
+            matrixArray_8  = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+            matrixArray_9  = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            // interleave per 256 bits
+            matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8,  matrixArray_10, 0x44);  // low 256 bits of each operand
+            matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8,  matrixArray_10, 0xee);  // high 256 bits of each operand
+            matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9,  matrixArray_11, 0x44);
+            matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9,  matrixArray_11, 0xee);
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x32(matrixArray)
+
+            // Calculate the temp result for a..h[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+            accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+            accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);  // regroup so the two 256-bit halves can be added lane by lane
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;  // these 8 rows are done
+        }
+
+        if (m - tag_m_16x > 3) {  // at least 4 rows left: one 4-row step on 256-bit registers
+            __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+            __m256  accum256_0, accum256_1;
+
+            xArray256_0 = _mm512_castsi512_si256(xArray_0);  // low 256 bits of each prepared x register
+            xArray256_1 = _mm512_castsi512_si256(xArray_1);
+            xArray256_2 = _mm512_castsi512_si256(xArray_2);
+            xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+
+            BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x16(matrixArray256)
+
+            // Calculate the temp result for a..d[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+            accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+            __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));  // fold the upper 128 bits onto the lower
+            STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+            tag_m_16x += 4;  // these 4 rows are done
+        }
+    }
+
+    if (tag_m_16x != m) {  // rows still unprocessed: one row per iteration
+        __m256i matrixArray256;
+        __m256  accum256;
+        __m128  accum128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            accum256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*13]);       // Load 1 row with n=13, upper 3 lanes zeroed
+            accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);  // 8 partial sums, one per element pair
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));  // fold the upper 128 bits onto the lower
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);  // bring elements 2,3 down to positions 0,1
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);  // bring element 1 down to position 0
+            accum128 = _mm_add_ps(accum128, tmp128);            // element 0 now holds the full dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=14 with densely packed rows (row i starts at a[i*14], no lda argument): 16 rows per main-loop pass, then optional 8-row and 4-row steps, then single rows
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x14_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x14_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x14_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x  = m & (~15);  // m rounded down to a multiple of 16; advanced further by the 8/4-row steps below
+
+    unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 2);  // 0x3fff: low 14 of 16 16-bit lanes
+    __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value);
+    __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x);    // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|0|0|
+
+    if (tag_m_16x > 0) {  // taken only when m >= 16; the 8-row and 4-row steps are nested in this branch, so smaller m goes straight to the per-row loop
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2,  matrixArray_3,  matrixArray_4,  matrixArray_5,  matrixArray_6,  matrixArray_7, \
+                matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+        __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+        __m512  accum512_0, accum512_1;
+        __m512  result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);  // not named again in this function; presumably consumed by the STORE macros - confirm
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);   // not named again in this function; presumably consumed by the STORE macros - confirm
+#endif
+
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0   = _mm512_set_epi32(27, 26, 25, 24, 11, 10,  9,  8, 19, 18, 17, 16,  3,  2,  1,  0);  // lane 0 first: 1st operand 0-3 | 2nd operand 0-3 | 1st operand 8-11 | 2nd operand 8-11
+        __m512i idx_base_1   = _mm512_add_epi32(idx_base_0, M512_EPI32_4);  // same pattern for lanes 4-7 and 12-15 of each operand
+        __m512i shift_idx    = _mm512_set_epi32(0,  13, 12, 11, 10,  9,  8,  7,  0,  6,  5,  4,  3,  2,  1,  0);  // lanes 0-6 <- dwords 0-6, lanes 8-14 <- dwords 7-13 (one 14-element row = 7 dwords); lanes 7 and 15 read dword 0
+
+        unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4);  // low 28 of 32 16-bit lanes: two rows of 14
+        __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+        // Prepare X: copy x256 into both 256-bit halves, then run the 2-step interleave macro (defined elsewhere; presumably fills xArray_1..3 - confirm)
+        xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
+        BF16_INTERLEAVE_1x32(xArray)
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load matrix: masked 512-bit loads starting at row idx_m (macro defined elsewhere; presumably two rows per register - confirm)
+            BF16_MATRIX_MASKZ_LOAD_8x32_2(matrixArray, a, 14, idx_m, 0, load_mask)
+
+            // Pre-stage: move the 2nd row in each register up one dword so it starts on the upper 256-bit half
+            BF16_PERMUTE_8x32_2(shift_idx, matrixArray)
+
+            // interleave per 256 bits
+            BF16_INTERLEAVE256_8x32(matrixArray)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_8x32(matrixArray)
+
+            // Calculate the temp result for a..p[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+            // Reorder and add up the final result
+            result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+            result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+            result_0 = _mm512_add_ps(result_0, result_1);
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)  // macro defined outside this function; presumably applies alpha/beta and stores 16 results - confirm
+        }
+
+        if (m - tag_m_16x > 7) {  // at least 8 rows left: one 8-row step
+            __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12,  7,  6,  5,  4, 11, 10,  9,  8,  3,  2,  1,  0);  // swaps lane groups 4-7 and 8-11
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load matrix: masked 512-bit loads starting at row tag_m_16x
+            BF16_MATRIX_MASKZ_LOAD_4x32_2(matrixArray, a, 14, tag_m_16x, 0, load_mask)
+
+            // Pre-stage: move the 2nd row in each register up one dword so it starts on the upper 256-bit half
+            BF16_PERMUTE_4x32_2(shift_idx, matrixArray)
+
+            // interleave per 256 bits
+            BF16_INTERLEAVE256_4x32(matrixArray)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x32(matrixArray)
+
+            // Calculate the temp result for a..h[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+            accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+            accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);  // regroup so the two 256-bit halves can be added lane by lane
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;  // these 8 rows are done
+        }
+
+        if (m - tag_m_16x > 3) {  // at least 4 rows left: one 4-row step on 256-bit registers
+            __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7;
+            __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+            __m256  accum256_0, accum256_1;
+
+            xArray256_0 = _mm512_castsi512_si256(xArray_0);  // low 256 bits of each prepared x register
+            xArray256_1 = _mm512_castsi512_si256(xArray_1);
+            xArray256_2 = _mm512_castsi512_si256(xArray_2);
+            xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+
+            BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 14, tag_m_16x, 0, x_load_mask)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x16(matrixArray256)
+
+            // Calculate the temp result for a..d[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+            accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+            __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));  // fold the upper 128 bits onto the lower
+            STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+            tag_m_16x += 4;  // these 4 rows are done
+        }
+    }
+
+    if (tag_m_16x != m) {  // rows still unprocessed: one row per iteration
+        __m256i matrixArray256;
+        __m256  accum256;
+        __m128  accum128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            accum256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*14]);       // Load 1 row with n=14, upper 2 lanes zeroed
+            accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);  // 8 partial sums, one per element pair
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));  // fold the upper 128 bits onto the lower
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);  // bring elements 2,3 down to positions 0,1
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);  // bring element 1 down to position 0
+            accum128 = _mm_add_ps(accum128, tmp128);            // element 0 now holds the full dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=15 with rows packed back to back (row i starts at a[i*15], no lda argument): 16 rows per main-loop round, then 8-, 4- and 1-row tails. The function name is picked by ZERO_BETA / ONE_BETA / ONE_ALPHA.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x15_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x15_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x15_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x  = m & (~15);    // m rounded down to a multiple of 16; advanced below as the 8- and 4-row tails complete
+
+    unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 1);    // 0x7fff: low 15 bits set, one per element of a row / of x
+    __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value);
+    __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x);    // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|0|
+
+    if (tag_m_16x > 0) {    // NOTE: the 8- and 4-row paths are nested in this branch, so for m < 16 every row goes through the single-row loop at the bottom
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2,  matrixArray_3,  matrixArray_4,  matrixArray_5,  matrixArray_6,  matrixArray_7, \
+                matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+        __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+        __m512  accum512_0, accum512_1;
+        __m512  result_0, result_1;
+
+        __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);    // not referenced by name below; presumably consumed by the STORE*_COMPLETE_RESULT macros - confirm against their definitions
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0   = _mm512_set_epi32(27, 26, 25, 24, 11, 10,  9,  8, 19, 18, 17, 16,  3,  2,  1,  0);    // indices >= 16 select from the second permutex2var source
+        __m512i idx_base_1   = _mm512_add_epi32(idx_base_0, M512_EPI32_4);    // same pattern shifted by 4 lanes
+
+        unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2);    // NOTE(review): load_mask is not referenced by name in this function (all loads pass x_load_mask) - confirm it can be dropped
+        __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+        // Prepare X with 2-step interleave way
+        xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);    // x256 duplicated into both 256-bit halves
+        BF16_INTERLEAVE_1x32(xArray)
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load matrix (rows starting at idx_m, 15-element mask per 256-bit register)
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m, 0, x_load_mask)
+
+            matrixArray_8  = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);    // pack two 256-bit loads: even one -> low half, odd one -> high half
+            matrixArray_9  = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m+8, 0, x_load_mask)
+
+            matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+            matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            // interleave per 256 bits
+            BF16_INTERLEAVE256_8x32(matrixArray)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_8x32(matrixArray)
+
+            // Calculate the temp result for a..p[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+            // Reorder and add up the final result
+            result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);    // {acc0[0:3], acc1[0:3], acc0[8:11], acc1[8:11]}
+            result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);    // {acc0[4:7], acc1[4:7], acc0[12:15], acc1[12:15]}
+            result_0 = _mm512_add_ps(result_0, result_1);
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {    // at least 8 rows left after the 16-row rounds
+            __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12,  7,  6,  5,  4, 11, 10,  9,  8,  3,  2,  1,  0);
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load matrix (rows starting at tag_m_16x)
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask)
+
+            matrixArray_8  = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+            matrixArray_9  = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            // interleave per 256 bits
+            matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8,  matrixArray_10, 0x44);    // low 256 bits of each source
+            matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8,  matrixArray_10, 0xee);    // high 256 bits of each source
+            matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9,  matrixArray_11, 0x44);
+            matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9,  matrixArray_11, 0xee);
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x32(matrixArray)
+
+            // Calculate the temp result for a..h[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+            accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+            accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);    // swap the two middle 4-lane groups before folding the halves
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));    // low half + high half
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;    // these 8 rows of y are done
+        }
+
+        if (m - tag_m_16x > 3) {    // at least 4 rows still left
+            __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+            __m256  accum256_0, accum256_1;
+
+            xArray256_0 = _mm512_castsi512_si256(xArray_0);    // reuse the low 256 bits of the interleaved x registers
+            xArray256_1 = _mm512_castsi512_si256(xArray_1);
+            xArray256_2 = _mm512_castsi512_si256(xArray_2);
+            xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+
+            BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x16(matrixArray256)
+
+            // Calculate the temp result for a..d[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+            accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+            __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));    // low half + high half
+            STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+            tag_m_16x += 4;    // these 4 rows of y are done
+        }
+    }
+
+    if (tag_m_16x != m) {    // rows not covered by the 16/8/4-row paths, one at a time
+        __m256i matrixArray256;
+        __m256  accum256;
+        __m128  accum128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            accum256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*15]);       // Load 1 row with n=15 (16th element zeroed by the mask)
+            accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);    // bf16 pair products accumulated into 8 float lanes
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));    // fold 8 lanes into 4
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);    // bring lanes 2,3 down to lanes 0,1
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);    // bring lane 1 down to lane 0
+            accum128 = _mm_add_ps(accum128, tmp128);    // lane 0 now holds the dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=16 with rows packed back to back (row i starts at a[i*16], no lda argument): 16 rows per main-loop round, then 8-, 4- and 1-row tails. The function name is picked by ZERO_BETA / ONE_BETA / ONE_ALPHA.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x16_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x16_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x16_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x  = m & (~15);    // m rounded down to a multiple of 16; advanced below as the 8- and 4-row tails complete
+
+    __m256i x256 = _mm256_loadu_si256((const __m256i *) x);    // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|  (explicit cast: the intrinsic takes a __m256i pointer, not bfloat16 *)
+
+    if (tag_m_16x > 0) {    // NOTE: the 8- and 4-row paths are nested in this branch, so for m < 16 every row goes through the single-row loop at the bottom
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2,  matrixArray_3,  matrixArray_4,  matrixArray_5,  matrixArray_6,  matrixArray_7, \
+                matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+        __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+        __m512  accum512_0, accum512_1;
+        __m512  result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);    // not referenced by name below; presumably consumed by the STORE*_COMPLETE_RESULT macros - confirm against their definitions
+#endif
+#ifndef ZERO_BETA
+        __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0   = _mm512_set_epi32(27, 26, 25, 24, 11, 10,  9,  8, 19, 18, 17, 16,  3,  2,  1,  0);    // indices >= 16 select from the second permutex2var source
+        __m512i idx_base_1   = _mm512_add_epi32(idx_base_0, M512_EPI32_4);    // same pattern shifted by 4 lanes
+
+        // Prepare X with 2-step interleave way
+        xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);    // x256 duplicated into both 256-bit halves
+        BF16_INTERLEAVE_1x32(xArray)
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            matrixArray_8  = _mm512_loadu_si512(&a[(idx_m   )*16]);  // Load 2 rows with n=16
+            matrixArray_9  = _mm512_loadu_si512(&a[(idx_m+2 )*16]);  // Load 2 rows with n=16
+            matrixArray_10 = _mm512_loadu_si512(&a[(idx_m+4 )*16]);  // Load 2 rows with n=16
+            matrixArray_11 = _mm512_loadu_si512(&a[(idx_m+6 )*16]);  // Load 2 rows with n=16
+            matrixArray_12 = _mm512_loadu_si512(&a[(idx_m+8 )*16]);  // Load 2 rows with n=16
+            matrixArray_13 = _mm512_loadu_si512(&a[(idx_m+10)*16]);  // Load 2 rows with n=16
+            matrixArray_14 = _mm512_loadu_si512(&a[(idx_m+12)*16]);  // Load 2 rows with n=16
+            matrixArray_15 = _mm512_loadu_si512(&a[(idx_m+14)*16]);  // Load 2 rows with n=16
+
+            // interleave per 256 bits
+            BF16_INTERLEAVE256_8x32(matrixArray)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_8x32(matrixArray)
+
+            // Calculate the temp result for a..p[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+            // Reorder and add up the final result
+            result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);    // {acc0[0:3], acc1[0:3], acc0[8:11], acc1[8:11]}
+            result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);    // {acc0[4:7], acc1[4:7], acc0[12:15], acc1[12:15]}
+            result_0 = _mm512_add_ps(result_0, result_1);
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {    // at least 8 rows left after the 16-row rounds
+            __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12,  7,  6,  5,  4, 11, 10,  9,  8,  3,  2,  1,  0);
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            matrixArray_4 = _mm512_loadu_si512(&a[(tag_m_16x   )*16]);  // Load 2 rows with n=16
+            matrixArray_5 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]);  // Load 2 rows with n=16
+            matrixArray_6 = _mm512_loadu_si512(&a[(tag_m_16x+4 )*16]);  // Load 2 rows with n=16
+            matrixArray_7 = _mm512_loadu_si512(&a[(tag_m_16x+6 )*16]);  // Load 2 rows with n=16
+
+            // interleave per 256 bits
+            BF16_INTERLEAVE256_4x32(matrixArray)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x32(matrixArray)
+
+            // Calculate the temp result for a..h[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+            accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+            accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);    // swap the two middle 4-lane groups before folding the halves
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));    // low half + high half
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;    // these 8 rows of y are done
+        }
+
+        if (m - tag_m_16x > 3) {    // at least 4 rows still left
+            __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2,  matrixArray256_3, \
+                    matrixArray256_4, matrixArray256_5, matrixArray256_6,  matrixArray256_7;
+            __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+            __m256  accum256_0, accum256_1;
+
+            xArray256_0 = _mm512_castsi512_si256(xArray_0);    // reuse the low 256 bits of the interleaved x registers
+            xArray256_1 = _mm512_castsi512_si256(xArray_1);
+            xArray256_2 = _mm512_castsi512_si256(xArray_2);
+            xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x   )*16]);  // Load 2 rows with n=16
+            matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]);  // Load 2 rows with n=16
+
+            matrixArray256_0 = _mm512_castsi512_si256(matrixArray_0);    // split each 2-row register back into one row per 256-bit register
+            matrixArray256_1 = _mm512_extracti32x8_epi32(matrixArray_0, 0x1);
+            matrixArray256_2 = _mm512_castsi512_si256(matrixArray_1);
+            matrixArray256_3 = _mm512_extracti32x8_epi32(matrixArray_1, 0x1);
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x16(matrixArray256)
+
+            // Calculate the temp result for a..d[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+            accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+            __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));    // low half + high half
+            STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+            tag_m_16x += 4;    // these 4 rows of y are done
+        }
+    }
+
+    if (tag_m_16x != m) {    // rows not covered by the 16/8/4-row paths, one at a time
+        __m256i matrixArray256;
+        __m256  accum256;
+        __m128  accum128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            accum256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_loadu_si256((const __m256i *) &a[(i)*16]);       // Load 1 row with n=16 (explicit cast to the pointer type the intrinsic expects)
+            accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);    // bf16 pair products accumulated into 8 float lanes
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));    // fold 8 lanes into 4
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);    // bring lanes 2,3 down to lanes 0,1
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);    // bring lane 1 down to lane 0
+            accum128 = _mm_add_ps(accum128, tmp128);    // lane 0 now holds the dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n>16 && lda effective scenario: row i starts at a[i*lda] and is covered by one masked 32-element load; 8 rows per main-loop round, then 4- and 1-row tails. The function name is picked by ZERO_BETA / ONE_BETA / ONE_ALPHA.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_8x16p_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_8x16p_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_8x16p_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_8x  = m & (~7);    // m rounded down to a multiple of 8; advanced below when the 4-row tail completes
+
+    unsigned int load_mask_value = (((unsigned int)0xffffffff) >> (32-n));    // low n bits set; the shift is only defined for 1 <= n <= 32 - NOTE(review): confirm the dispatcher guarantees this range
+    __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+    __m512i x512 = _mm512_maskz_loadu_epi16(load_mask, x);    // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|...
+
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);    // not referenced by name below; presumably consumed by the STORE*_COMPLETE_RESULT macros - confirm against their definitions
+#endif
+#ifndef ZERO_BETA
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2,  matrixArray_3,  matrixArray_4,  matrixArray_5,  matrixArray_6,  matrixArray_7, \
+            matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+    __m512  accum512_0, accum512_1, accum512_2, accum512_3;
+    __m256  accum256;
+    __m128  accum128;
+
+    if (tag_m_8x > 0) {    // NOTE: the 4-row path is nested in this branch, so for m < 8 every row goes through the single-row loop at the bottom
+        __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0   = _mm512_set_epi32(27, 26, 25, 24, 11, 10,  9,  8, 19, 18, 17, 16,  3,  2,  1,  0);    // indices >= 16 select from the second permutex2var source
+        __m512i idx_base_1   = _mm512_add_epi32(idx_base_0, M512_EPI32_4);    // same pattern shifted by 4 lanes
+
+        // Prepare X with 2-step interleave way
+        xArray_0 = x512;
+        BF16_INTERLEAVE_1x32(xArray)
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load 8 rows from matrix
+            BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, 0, load_mask)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_8x32(matrixArray)
+
+            // Calculate the temp result for a..h[0:31]
+            BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+            // Reorder and add up the final result
+            accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);    // {acc0[0:3], acc1[0:3], acc0[8:11], acc1[8:11]}
+            accum512_3 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);    // {acc0[4:7], acc1[4:7], acc0[12:15], acc1[12:15]}
+            accum512_2 = _mm512_add_ps(accum512_2, accum512_3);
+            accum256   = _mm256_add_ps(_mm512_castps512_ps256(accum512_2), _mm512_extractf32x8_ps(accum512_2, 1));    // low half + high half
+            STORE8_COMPLETE_RESULT(accum256, y+idx_m)
+        }
+
+        if (m - tag_m_8x > 3) {    // at least 4 rows left after the 8-row rounds
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load 4 rows from matrix
+            BF16_MATRIX_MASKZ_LOAD_4x32(matrixArray, a, lda, tag_m_8x, 0, load_mask)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x32(matrixArray)
+
+            // Calculate the temp result for a..d[0:31]
+            BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+            accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+            accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));    // fold 16 lanes into 8
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));    // fold 8 lanes into 4
+            STORE4_COMPLETE_RESULT(accum128, y+tag_m_8x)
+            tag_m_8x += 4;    // these 4 rows of y are done
+        }
+    }
+
+    if (tag_m_8x != m) {    // rows not covered by the 8/4-row paths, one at a time
+        __m128  tmp128;
+        for (BLASLONG i = tag_m_8x; i < m; i++) {
+            accum512_0 = _mm512_setzero_ps();
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(i)*lda]);       // Load 1 row: first n elements (masked), row stride lda
+            accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) matrixArray_0, (__m512bh) x512);    // bf16 pair products accumulated into 16 float lanes
+            accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));    // fold 16 lanes into 8
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));    // fold 8 lanes into 4
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);    // bring lanes 2,3 down to lanes 0,1
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);    // bring lane 1 down to lane 0
+            accum128 = _mm_add_ps(accum128, tmp128);    // lane 0 now holds the dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for big N && lda effective scenario (process before interleave): every row is reduced on its own, 128 elements per round, and the partial sums of 8 consecutive rows are then interleaved and stored together. The function name is picked by ZERO_BETA / ONE_BETA / ONE_ALPHA.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_1x128_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_1x128_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_1x128_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_8x   = m & (~7);      // m rounded down to a multiple of 8
+    BLASLONG tag_n_32x  = n & (~31);     // n rounded down to a multiple of 32
+    BLASLONG tag_n_128x = n & (~127);    // n rounded down to a multiple of 128
+
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \
+           accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;    // NOTE(review): not referenced by name in this function - confirm no macro needs them
+    __m512 accum512_bridge[8];    // one partial-sum register per row of the current 8-row group
+    __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3;
+    __m256 accum256_0;
+    __m128 accum128;
+
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);    // not referenced by name below; presumably consumed by the STORE*_COMPLETE_RESULT macros - confirm against their definitions
+#endif
+#ifndef ZERO_BETA
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2,  matrixArray_3;
+    __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+
+    unsigned int tail_mask_value = (n&31) ? (((unsigned int)0xffffffff) >> (32-(n&31))) : 0;    // low (n%32) bits set; guarded because shifting a 32-bit value by 32 is undefined when n is a multiple of 32
+    __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);
+
+    __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+    __m512i idx_base_0   = _mm512_set_epi32(27, 26, 25, 24, 11, 10,  9,  8, 19, 18, 17, 16,  3,  2,  1,  0);    // indices >= 16 select from the second permutex2var source
+    __m512i idx_base_1   = _mm512_add_epi32(idx_base_0, M512_EPI32_4);    // same pattern shifted by 4 lanes
+
+    if (tag_m_8x > 0) {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            for (BLASLONG j = idx_m; j < idx_m + 8; j++) {    // BLASLONG (not int) so the row index cannot truncate and matches the tail loop below
+                accum512_t_0 = _mm512_setzero_ps();
+                accum512_t_1 = _mm512_setzero_ps();
+                accum512_t_2 = _mm512_setzero_ps();
+                accum512_t_3 = _mm512_setzero_ps();
+                /* Processing the main chunk with 128-elements per round */
+                for (BLASLONG idx_n = 0; idx_n < tag_n_128x; idx_n += 128) {
+                    BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n +  0)
+                    BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32)
+                    BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64)
+                    BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96)
+
+                    BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0)
+                    BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32)
+                    BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64)
+                    BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96)
+
+                    BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0)
+                    BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1)
+                    BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2)
+                    BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3)
+                }
+
+                /* Processing the remaining <128 chunk with 32-elements per round */
+                for (BLASLONG idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) {
+                    BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n)
+                    BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n)
+                    BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0)
+                }
+
+                /* Processing the remaining <32 chunk with masked 32-elements processing */
+                if ((n&31) != 0) {
+                    BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask)
+                    BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+                    BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0)
+                }
+
+                /* Accumulate the 4 registers into 1 register */
+                accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1);
+                accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3);
+                accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2);
+
+                // Temporarily save this row's partial sums into its slot of the bridge array
+                accum512_bridge[j-idx_m] = accum512_t_0;
+            }
+
+            FP32_INTERLEAVE_8x16_ARRAY(accum512_bridge)
+            FP32_ACCUM2_8x16_ARRAY(accum512_bridge)
+            accum512_bridge[1] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_0, accum512_bridge[4]);    // slots 1 and 2 are reused as scratch from here on
+            accum512_bridge[2] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_1, accum512_bridge[4]);
+            accum512_bridge[1] = _mm512_add_ps(accum512_bridge[1], accum512_bridge[2]);
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_bridge[1]), _mm512_extractf32x8_ps(accum512_bridge[1], 1));    // low half + high half
+            STORE8_COMPLETE_RESULT(accum256_0, y+idx_m)
+        }
+    }
+
+    if (tag_m_8x != m) {    // remaining (m % 8) rows, reduced and stored one at a time
+        __m128  tmp128;
+        for (BLASLONG j = tag_m_8x; j < m; j++) {
+            accum512_t_0 = _mm512_setzero_ps();
+            accum512_t_1 = _mm512_setzero_ps();
+            accum512_t_2 = _mm512_setzero_ps();
+            accum512_t_3 = _mm512_setzero_ps();
+            /* Processing the main chunk with 128-elements per round */
+            for (BLASLONG idx_n = 0; idx_n < tag_n_128x; idx_n += 128) {
+                BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n +  0)
+                BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32)
+                BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64)
+                BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96)
+
+                BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0)
+                BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32)
+                BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64)
+                BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96)
+
+                BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0)
+                BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1)
+                BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2)
+                BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3)
+            }
+
+            /* Processing the remaining <128 chunk with 32-elements per round */
+            for (BLASLONG idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) {
+                BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n)
+                BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n)
+                BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0)
+            }
+
+            /* Processing the remaining <32 chunk with masked 32-elements processing */
+            if ((n&31) != 0) {
+                BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask)
+                BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+                BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0)
+            }
+
+            /* Accumulate the 4 registers into 1 register */
+            accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1);
+            accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3);
+            accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2);
+
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_t_0), _mm512_extractf32x8_ps(accum512_t_0, 1));    // fold 16 lanes into 8
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));    // fold 8 lanes into 4
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);    // bring lanes 2,3 down to lanes 0,1
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);    // bring lane 1 down to lane 0
+            accum128 = _mm_add_ps(accum128, tmp128);    // lane 0 now holds the dot product of row j with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[j] = alpha * accum128[0] + beta * y[j];
+#else
+            y[j] = alpha * accum128[0] + y[j];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[j] = accum128[0] * alpha;
+#else
+            y[j] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 8 rows parallel processing BF16 GEMV kernel for n=32 && lda effective scenario (process before interleave)
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_8x32_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_8x32_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_8x32_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_8x  = m & (~7);
+    BLASLONG tag_n_32x = n & (~31);
+
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \
+           accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;
+    __m256 accum256_0;
+    __m128 accum128;
+
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512  BETAVECTOR  = _mm512_set1_ps(beta);
+#endif
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2,  matrixArray_3,  matrixArray_4,  matrixArray_5,  matrixArray_6,  matrixArray_7;
+    __m512i xArray_0;
+
+    unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31)));
+    __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);
+
+    if (tag_m_8x > 0) {
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0   = _mm512_set_epi32(27, 26, 25, 24, 11, 10,  9,  8, 19, 18, 17, 16,  3,  2,  1,  0);
+        __m512i idx_base_1   = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+            accum512_2 = _mm512_setzero_ps();
+            accum512_3 = _mm512_setzero_ps();
+            accum512_4 = _mm512_setzero_ps();
+            accum512_5 = _mm512_setzero_ps();
+            accum512_6 = _mm512_setzero_ps();
+            accum512_7 = _mm512_setzero_ps();
+
+            for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) {
+                // Load 8 rows from matrix
+                BF16_MATRIX_LOAD_8x32(matrixArray, a, lda, idx_m, idx_n)
+
+                // Load x
+                BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n)
+
+                // Calculate the temp result for a..h[0:31]
+                BF16_DOT_8x32(accum512, matrixArray, xArray_0)
+            }
+
+            if (tag_n_32x != n) {         // Go with masked 512
+                // Load 8 rows from matrix
+                BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, tag_n_32x, tail_mask)
+
+                // Load x
+                BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+
+                // Calculate the temp result for a..h[0:31]
+                BF16_DOT_8x32(accum512, matrixArray, xArray_0)
+            }
+
+            // 2-step interleave for FP32 register array
+            FP32_INTERLEAVE_8x16(accum512)
+
+            // Accumulate the 2 batch of registers into 2 register (0 and 4)
+            FP32_ACCUM2_8x16(accum512)
+
+            accum512_1 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_4);
+            accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_4);
+            accum512_1 = _mm512_add_ps(accum512_1, accum512_2);
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_1), _mm512_extractf32x8_ps(accum512_1, 1));
+            STORE8_COMPLETE_RESULT(accum256_0, y+idx_m)
+        }
+    }
+
+    if (tag_m_8x != m) {
+        __m128  tmp128;
+        for (BLASLONG i = tag_m_8x; i < m; i++) {
+            accum512_0 = _mm512_setzero_ps();
+            for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) {
+                // Load 32 elements from matrix
+                BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, i, idx_n)
+
+                // Load 32 elements from x
+                BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n)
+
+                // Calculate and accumulate the temp result
+                BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            }
+
+            if (tag_n_32x != n) {
+                // Load tail elements from matrix
+                BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, i, tag_n_32x, tail_mask)
+
+                // Load 32 elements from x
+                BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+
+                // Calculate and accumulate the temp result
+                BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            }
+
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+            accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 8 rows parallel processing BF16 GEMV kernel for n<=16 && lda effective scenario
+//   m     - number of rows of a to process (one y element is written per row)
+//   n     - number of bfloat16 elements used from each row and from x (at most 16)
+//   a/lda - matrix data, row i starts at a[i*lda]
+//   x, y  - bfloat16 input vector and float result vector
+// Rows are handled 8 at a time and stored through STORE8_COMPLETE_RESULT; the
+// remaining m%8 rows are reduced one by one and stored with the alpha/beta
+// variant selected by ONE_ALPHA/ZERO_BETA/ONE_BETA. Always returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_8x16m_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_8x16m_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_8x16m_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_8x  = m & (~7);
+
+    __m256i matrixArray_0, matrixArray_1, matrixArray_2,  matrixArray_3,  matrixArray_4,  matrixArray_5,  matrixArray_6,  matrixArray_7;
+    __m256i xArray256;
+
+    // Keep align with other kernels and macro definition, the high 256bit is never used
+#ifndef ONE_ALPHA
+    __m512  ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha));
+#endif
+#ifndef ZERO_BETA
+    __m512  BETAVECTOR  = _mm512_castps256_ps512(_mm256_set1_ps(beta));
+#endif
+
+    __m256  accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \
+            accum256_8, accum256_9, accum256_10, accum256_11, accum256_12, accum256_13, accum256_14, accum256_15;
+
+    __m256i M256_EPI32_4 = _mm256_set1_epi32(4);
+    __m256i idx_base_0   = _mm256_set_epi32(11, 10,  9,  8,  3,  2,  1,  0);
+    __m256i idx_base_1   = _mm256_add_epi32(idx_base_0, M256_EPI32_4);
+
+    // load_mask has its low n bits set, i.e. all 16 bits when n == 16
+    unsigned short load_mask_value = (((unsigned short)0xffff) >> (16-n));
+    __mmask16 load_mask = *((__mmask16*) &load_mask_value);
+
+    // x is loaded once and reused for every row
+    if (n == 16) {
+        BF16_VECTOR_LOAD_1x16(xArray256, x, 0)
+    } else {
+        BF16_VECTOR_MASKZ_LOAD_1x16(xArray256, x, 0, load_mask)
+    }
+
+    if (n == 16) {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+            accum256_2 = _mm256_setzero_ps();
+            accum256_3 = _mm256_setzero_ps();
+            accum256_4 = _mm256_setzero_ps();
+            accum256_5 = _mm256_setzero_ps();
+            accum256_6 = _mm256_setzero_ps();
+            accum256_7 = _mm256_setzero_ps();
+
+            // Load 8 rows from matrix (no mask needed when n == 16)
+            BF16_MATRIX_LOAD_8x16(matrixArray, a, lda, idx_m, 0)
+
+            BF16_DOT_8x16(accum256, matrixArray, xArray256)
+
+            // 2-step interleave for FP32 register array
+            FP32_INTERLEAVE_8x8(accum256)
+
+            // Accumulate the 2 batch of registers into 2 register (0 and 4)
+            FP32_ACCUM2_8x8(accum256)
+
+            accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4);
+            accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4);
+            accum256_1 = _mm256_add_ps(accum256_1, accum256_2);
+
+            STORE8_COMPLETE_RESULT(accum256_1, y+idx_m)
+        }
+    } else {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+            accum256_2 = _mm256_setzero_ps();
+            accum256_3 = _mm256_setzero_ps();
+            accum256_4 = _mm256_setzero_ps();
+            accum256_5 = _mm256_setzero_ps();
+            accum256_6 = _mm256_setzero_ps();
+            accum256_7 = _mm256_setzero_ps();
+
+            // Load 8 rows from matrix, masked by load_mask (n < 16)
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray, a, lda, idx_m, 0, load_mask)
+
+            BF16_DOT_8x16(accum256, matrixArray, xArray256)
+
+            // 2-step interleave for FP32 register array
+            FP32_INTERLEAVE_8x8(accum256)
+
+            // Accumulate the 2 batch of registers into 2 register (0 and 4)
+            FP32_ACCUM2_8x8(accum256)
+
+            accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4);
+            accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4);
+            accum256_1 = _mm256_add_ps(accum256_1, accum256_2);
+
+            STORE8_COMPLETE_RESULT(accum256_1, y+idx_m)
+        }
+    }
+
+    // Rows left over after the 8-row blocks. One loop serves both cases: for
+    // n == 16 load_mask is all ones, so the masked load reads the whole row.
+    // Every result goes through the same alpha/beta selection, so beta is
+    // honoured (and y overwritten for ZERO_BETA) regardless of n.
+    if (tag_m_8x != m) {
+        __m128  accum128, tmp128;
+        for (BLASLONG i = tag_m_8x; i < m; i++) {
+            accum256_0 = _mm256_setzero_ps();
+            matrixArray_0 = _mm256_maskz_loadu_epi16(load_mask, &a[(i)*lda]);       // Load 1 row, first n elements
+            accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256);
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+            accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}

From c5e62dad69ca13d48c2e9ce29a6398668e687dc9 Mon Sep 17 00:00:00 2001
From: "Chen, Guobing" <guobing.chen@intel.com>
Date: Thu, 29 Oct 2020 03:37:51 +0800
Subject: [PATCH 58/83] Fix cooperlake compile issue

Add a missing macro that Makefile.x86_64 requires since the recent
cleanup; without it the build fails on the cooperlake platform.
---
 Makefile.system | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile.system b/Makefile.system
index 6d985786d..52d3e2cdc 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -319,6 +319,7 @@ ifeq ($(GCCVERSIONGTEQ7),1)
 else
 	GCCDUMPVERSION_PARAM := -dumpversion
 endif
+GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
 GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
 endif

From b43549188525741f311d6e5574c0fd960f964204 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Thu, 29 Oct 2020 14:57:51 -0500
Subject: [PATCH 59/83] Optimize caxpy for POWER10

This patch makes use of new POWER10 vector pair instructions for
loads and stores.
---
 kernel/power/KERNEL.POWER10         |   6 +-
 kernel/power/caxpy_microk_power10.c | 188 ++++++++++++++++++++++++++++
 kernel/power/caxpy_power10.c        | 126 +++++++++++++++++++
 3 files changed, 315 insertions(+), 5 deletions(-)
 create mode 100644 kernel/power/caxpy_microk_power10.c
 create mode 100644 kernel/power/caxpy_power10.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 1e514fcc9..b4c7a5e41 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -143,11 +143,7 @@ ZASUMKERNEL  = zasum.c
 #
 SAXPYKERNEL  = saxpy_power10.c
 DAXPYKERNEL  = daxpy_power10.c
-ifneq ($(GCCVERSIONGTEQ9),1)
-CAXPYKERNEL  = caxpy_power9.S
-else
-CAXPYKERNEL  = caxpy.c
-endif
+CAXPYKERNEL  = caxpy_power10.c
 ZAXPYKERNEL  = zaxpy_power10.c
 #
 SCOPYKERNEL  = scopy_power10.c
diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c
new file mode 100644
index 000000000..0d13416b3
--- /dev/null
+++ b/kernel/power/caxpy_microk_power10.c
@@ -0,0 +1,188 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1	/* caxpy_power10.c only builds its C fallback kernel when this is undefined */
+static void caxpy_kernel_8 (long n, float *x, float *y,
+			    float alpha_r, float alpha_i)
+{
+#if !defined(CONJ)
+  static const float mvec[4] = { -1.0, 1.0, -1.0, 1.0 };	/* sign pattern multiplied into alpha_i */
+#else
+  static const float mvec[4] = { 1.0, -1.0, 1.0, -1.0 };	/* sign pattern multiplied into alpha_r */
+#endif
+  const float *mvecp = mvec;
+  /* We have to load reverse mask for big endian.  */
+  /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
+  /* Adds alpha_r * x and alpha_i * (x with real/imag exchanged, signed by mvec) into y, 16 complex floats per pass.  */
+  __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};	/* xxperm control, operand %x10 */
+  long ytmp;	/* operand %4: output only, tied to mvecp on entry (constraint "4"), later the y store pointer */
+  /* The first 128 bytes of x and y are loaded before n is tested: n >= 16 is assumed (the caller in caxpy_power10.c passes a non-zero n & -16).  */
+  __asm__
+    (
+       "xscvdpspn 32, %7    \n\t"	// alpha_r: double format -> single format
+       "xscvdpspn 33, %8    \n\t"	// alpha_i: double format -> single format
+       "xxspltw 32, 32, 0   \n\t"	// splat alpha_r across vs32
+       "xxspltw 33, 33, 0   \n\t"	// splat alpha_i across vs33
+       "lxvd2x          36, 0, %9       \n\t"   // mvec
+
+#if !defined(CONJ)
+       "xvmulsp		33, 33, 36	\n\t"	// alpha_i * mvec
+#else
+       "xvmulsp		32, 32, 36	\n\t"	// alpha_r * mvec
+#endif
+       "mr		%4, %3		\n\t"	// ytmp = y; %4 is the store pointer from here on
+       "dcbt		0, %2		\n\t"	// cache touch hint for x
+       "dcbt		0, %3		\n\t"	// cache touch hint for y
+
+       "lxvp		40, 0(%2)	\n\t"	// x0
+       "lxvp		42, 32(%2)	\n\t"	// x2
+       "lxvp		48, 0(%3)	\n\t"	// y0
+       "lxvp		50, 32(%3)	\n\t"	// y2
+
+       "xxperm 52, 40, %x10 \n\t"       // exchange real and imag part
+       "xxperm 53, 41, %x10 \n\t"       // exchange real and imag part
+       "xxperm 54, 42, %x10 \n\t"       // exchange real and imag part
+       "xxperm 55, 43, %x10 \n\t"       // exchange real and imag part
+
+       "lxvp		44, 64(%2)	\n\t"	// x4
+       "lxvp		46, 96(%2)	\n\t"	// x6
+       "lxvp		34, 64(%3)	\n\t"	// y4
+       "lxvp		38, 96(%3)	\n\t"	// y6
+
+       "xxperm 56, 44, %x10 \n\t"       // exchange real and imag part
+       "xxperm 57, 45, %x10 \n\t"       // exchange real and imag part
+       "xxperm 58, 46, %x10 \n\t"       // exchange real and imag part
+       "xxperm 59, 47, %x10 \n\t"       // exchange real and imag part
+
+       "addi		%2, %2, 128	\n\t"	// x load pointer += 128 bytes
+       "addi		%3, %3, 128	\n\t"	// y load pointer += 128 bytes
+
+       "addic.		%1, %1, -16	\n\t"	// n -= 16, result recorded in cr0
+       "ble		two%=		\n\t"	// nothing left to load: go finish the block in registers
+
+       ".align	5		\n"
+       "one%=:				\n\t"	// loop: finish the loaded block while loading the next one
+
+       "xvmaddasp	48, 40, 32	\n\t"	// alpha_r * x0_r , alpha_r * x0_i
+       "xvmaddasp	49, 41, 32	\n\t"
+       "lxvp		40, 0(%2)	\n\t"	// x0
+       "xvmaddasp	50, 42, 32	\n\t"
+       "xvmaddasp	51, 43, 32	\n\t"
+       "lxvp		42, 32(%2)	\n\t"	// x2
+
+       "xvmaddasp	34, 44, 32	\n\t"
+       "xvmaddasp	35, 45, 32	\n\t"
+       "lxvp		44, 64(%2)	\n\t"	// x4
+       "xvmaddasp	38, 46, 32	\n\t"
+       "xvmaddasp	39, 47, 32	\n\t"
+       "lxvp		46, 96(%2)	\n\t"	// x6
+
+       "xvmaddasp	48, 52, 33	\n\t"	// alpha_i * x0_i , alpha_i * x0_r
+       "addi		%2, %2, 128	\n\t"	// x load pointer += 128 bytes
+       "xvmaddasp	49, 53, 33	\n\t"
+       "xvmaddasp	50, 54, 33	\n\t"
+       "xvmaddasp	51, 55, 33	\n\t"
+
+       "xvmaddasp	34, 56, 33	\n\t"
+       "xvmaddasp	35, 57, 33	\n\t"
+       "xvmaddasp	38, 58, 33	\n\t"
+       "xvmaddasp	39, 59, 33	\n\t"
+
+       "stxvp		48, 0(%4)	\n\t"	// store the finished y block through ytmp
+       "stxvp		50, 32(%4)	\n\t"
+       "stxvp		34, 64(%4)	\n\t"
+       "stxvp		38, 96(%4)	\n\t"
+
+       "addi		%4, %4, 128	\n\t"	// y store pointer += 128 bytes
+       "xxperm 52, 40, %x10 \n\t"       // exchange real and imag part
+       "xxperm 53, 41, %x10 \n\t"       // exchange real and imag part
+
+       "lxvp		48, 0(%3)	\n\t"	// y0
+       "xxperm 54, 42, %x10 \n\t"       // exchange real and imag part
+       "xxperm 55, 43, %x10 \n\t"       // exchange real and imag part
+       "lxvp		50, 32(%3)	\n\t"	// y2
+
+       "xxperm 56, 44, %x10 \n\t"       // exchange real and imag part
+       "xxperm 57, 45, %x10 \n\t"       // exchange real and imag part
+       "lxvp		34, 64(%3)	\n\t"	// y4
+       "xxperm 58, 46, %x10 \n\t"       // exchange real and imag part
+       "xxperm 59, 47, %x10 \n\t"       // exchange real and imag part
+       "lxvp		38, 96(%3)	\n\t"	// y6
+
+       "addi		%3, %3, 128	\n\t"	// y load pointer += 128 bytes
+
+       "addic.		%1, %1, -16	\n\t"	// n -= 16, result recorded in cr0
+       "bgt		one%=		\n"	// another block was loaded and more remain
+
+       "two%=:				\n\t"	// epilogue: last loaded block, no further loads
+       "xvmaddasp	48, 40, 32	\n\t"	// alpha_r * x0_r , alpha_r * x0_i
+       "xvmaddasp	49, 41, 32	\n\t"
+       "xvmaddasp	50, 42, 32	\n\t"
+       "xvmaddasp	51, 43, 32	\n\t"
+
+       "xvmaddasp	34, 44, 32	\n\t"
+       "xvmaddasp	35, 45, 32	\n\t"
+       "xvmaddasp	38, 46, 32	\n\t"
+       "xvmaddasp	39, 47, 32	\n\t"
+
+       "xvmaddasp	48, 52, 33	\n\t"	// alpha_i * x0_i , alpha_i * x0_r
+       "xvmaddasp	49, 53, 33	\n\t"
+       "xvmaddasp	50, 54, 33	\n\t"
+       "xvmaddasp	51, 55, 33	\n\t"
+
+       "xvmaddasp	34, 56, 33	\n\t"
+       "xvmaddasp	35, 57, 33	\n\t"
+       "xvmaddasp	38, 58, 33	\n\t"
+       "xvmaddasp	39, 59, 33	\n\t"
+
+       "stxvp		48, 0(%4)	\n\t"	// store the last y block through ytmp
+       "stxvp		50, 32(%4)	\n\t"
+       "stxvp		34, 64(%4)	\n\t"
+       "stxvp		38, 96(%4)	\n\t"
+
+     "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
+     :
+       "+m" (*y),	// 0
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y),	// 3
+       "=b" (ytmp)	// 4
+     :
+       "m" (*x),	// 5
+       "m" (*mvecp),	// 6
+       "d" (alpha_r),	// 7
+       "d" (alpha_i),	// 8
+       "4" (mvecp),	// 9, starts out in the register of operand 4 (ytmp)
+       "wa" (mask)	// 10
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
+       "vs56","vs57","vs58","vs59"
+     );
+}
diff --git a/kernel/power/caxpy_power10.c b/kernel/power/caxpy_power10.c
new file mode 100644
index 000000000..14b8cda67
--- /dev/null
+++ b/kernel/power/caxpy_power10.c
@@ -0,0 +1,126 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "caxpy_microk_power10.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+/* Plain C kernel for unit-stride data, compiled only when no other kernel has
+ * defined HAVE_KERNEL_8.  x and y hold interleaved (real, imag) pairs and n
+ * counts complex elements, two of which are consumed per pass.  Each pass
+ * adds (da_r + i*da_i) * x, or (da_r + i*da_i) * conj(x) when CONJ is
+ * defined, into y.  Reads and writes happen in the order written below. */
+static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i)
+{
+	BLASLONG done;	/* complex elements handled so far */
+	BLASLONG pos;	/* index of the current real part in x and y */
+
+	/* two complex elements = four FLOATs per pass */
+	for (done = 0, pos = 0; done < n; done += 2, pos += 4)
+	{
+#if !defined(CONJ)
+		y[pos]   += ( da_r * x[pos]   - da_i * x[pos+1] ) ;
+		y[pos+1] += ( da_r * x[pos+1] + da_i * x[pos]   ) ;
+		y[pos+2] += ( da_r * x[pos+2] - da_i * x[pos+3] ) ;
+		y[pos+3] += ( da_r * x[pos+3] + da_i * x[pos+2] ) ;
+#else
+		y[pos]   += ( da_r * x[pos]   + da_i * x[pos+1] ) ;
+		y[pos+1] -= ( da_r * x[pos+1] - da_i * x[pos]   ) ;
+		y[pos+2] += ( da_r * x[pos+2] + da_i * x[pos+3] ) ;
+		y[pos+3] -= ( da_r * x[pos+3] - da_i * x[pos+2] ) ;
+#endif
+	}
+
+}
+
+#endif
+
+/* Computes y += (da_r + i*da_i) * x for n complex elements; with CONJ defined
+ * conj(x) takes the place of x.
+ *   x, y         - interleaved (real, imag) FLOAT pairs
+ *   inc_x, inc_y - strides of x and y, counted in complex elements
+ *   dummy*       - not referenced in this function
+ * Nothing is done for n <= 0.  Always returns 0. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i, ix, iy, n1;
+
+	if ( n <= 0 )  return(0);
+
+	if ( (inc_x != 1) || (inc_y != 1) )
+	{
+		/* Strided data: turn the strides into FLOAT offsets and walk the
+		 * vectors one complex element at a time. */
+		inc_x *=2;
+		inc_y *=2;
+
+		for (i = 0, ix = 0, iy = 0; i < n; i++, ix += inc_x, iy += inc_y)
+		{
+#if !defined(CONJ)
+			y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
+			y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
+#else
+			y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
+			y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
+#endif
+		}
+
+		return(0);
+	}
+
+	/* Contiguous data: caxpy_kernel_8 takes the leading multiple of 16
+	 * elements, provided there is at least one such group. */
+	n1 = n & -16;
+
+	if ( n1 )
+	{
+		caxpy_kernel_8 (n1, x, y, da_r, da_i);
+	}
+
+	/* The remaining n - n1 elements start at FLOAT index 2 * n1. */
+	for (i = n1, ix = 2 * n1; i < n; i++, ix += 2)
+	{
+#if !defined(CONJ)
+		y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
+		y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
+#else
+		y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
+		y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
+#endif
+	}
+
+	return(0);
+
+}
+
+

From 1f564d729b147fb79831008af820a018f500a73a Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Sat, 31 Oct 2020 10:00:48 -0400
Subject: [PATCH 60/83] fix avx2 detection

reword comments to make them clearer
---
 cpuid_x86.c             | 16 ++++++++--------
 driver/others/dynamic.c | 12 ++++++------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 728d459d1..84c12ff43 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -202,7 +202,7 @@ int support_avx(){
   if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
     xgetbv(0, &eax, &edx);
     if((eax & 6) == 6){
-      ret=1;  //OS support AVX
+      ret=1;  //OS supports saving xmm and ymm registers (6 = (1<<1) | (1<<2))
     }
   }
   return ret;
@@ -219,8 +219,8 @@ int support_avx2(){
   if (!support_avx()) 
     return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 0)
-      ret=1;  //OS supports AVX2
+  if((ebx & (1<<5)) != 0)
+      ret=1;  //CPU supports AVX2
   return ret;
 #else
   return 0;
@@ -235,14 +235,14 @@ int support_avx512(){
   if (!support_avx()) 
     return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & 32) != 32){
-      ret=0;  //OS does not even support AVX2
+  if((ebx & (1<<5)) == 0){
+      ret=0;  //cpu does not have avx2 flag
   }
-  if((ebx & (1<<31)) != 0){
+  if((ebx & (1u<<31)) != 0){ //AVX512VL flag (1u: shifting a signed 1 by 31 overflows int)
     xgetbv(0, &eax, &edx); 
     if((eax & 0xe0) == 0xe0)
-      ret=1;  //OS supports AVX512VL
-  }
+      ret=1;  //OS supports saving zmm registers (XCR0 bits 5-7 = 0xe0)
+  }
   return ret;
 #else
   return 0;
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 21d2c7948..58f4d8b59 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -330,8 +330,8 @@ int support_avx2(){
   if (!support_avx())
     return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 0)
-      ret=1;  //OS supports AVX2
+  if((ebx & (1<<5)) != 0)
+      ret=1;  //AVX2 flag is set
   return ret;
 #else
   return 0;
@@ -346,13 +346,13 @@ int support_avx512(){
   if (!support_avx())
     return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) == 0){
-      ret=0;  //OS does not even support AVX2
+  if((ebx & (1<<5)) == 0){
+      ret=0;  //cpu does not have avx2 flag
   }
-  if((ebx & (1u<<31)) != 0){
+  if((ebx & (1u<<31)) != 0){ //AVX512VL flag is set (keep 1u: 1<<31 overflows a signed int)
     xgetbv(0, &eax, &edx);
     if((eax & 0xe0) == 0xe0)
-      ret=1;  //OS supports AVX512VL
+      ret=1;  //OS supports saving zmm registers (XCR0 bits 5-7 = 0xe0)
   }
   return ret;
 #else

From 9fab65e90ad35253014cd9620be0caaabf5f130b Mon Sep 17 00:00:00 2001
From: User User-User <user@localhost>
Date: Sun, 1 Nov 2020 00:38:08 +0200
Subject: [PATCH 61/83] add openbsd gfortran

---
 f_check | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/f_check b/f_check
index f894aa9ac..c12b0f2ef 100644
--- a/f_check
+++ b/f_check
@@ -33,7 +33,7 @@ if ($compiler eq "") {
               "ppuf77", "ppuf95", "ppuf90", "ppuxlf",
 	      "pathf90", "pathf95",
 	      "pgf95", "pgf90", "pgf77",
-	      "flang",
+	      "flang", "egfortran",
               "ifort");
 
 OUTER:

From 7f26be4802042d7c54bd1645c54adc3e2ff72d50 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 1 Nov 2020 00:00:43 +0100
Subject: [PATCH 62/83] Reunify BUFFERSIZE across arm64 platforms to avoid
 segfaults in DYNAMIC_ARCH

---
 common_arm64.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/common_arm64.h b/common_arm64.h
index 314946282..9cdded305 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -142,14 +142,8 @@ REALNAME:
 #define HUGE_PAGESIZE   ( 4 << 20)
 
 #ifndef BUFFERSIZE
-#if defined(CORTEXA57)
-#define BUFFER_SIZE     (20 << 20)
-#elif defined(TSV110) || defined(EMAG8180)
 #define BUFFER_SIZE     (32 << 20)
 #else
-#define BUFFER_SIZE     (16 << 20)
-#endif
-#else
 #define BUFFER_SIZE	(32 << BUFFERSIZE)
 #endif
 

From dd7a9cc5bf6b926a44b38d13366743691fd6e604 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Sat, 31 Oct 2020 18:28:57 -0500
Subject: [PATCH 63/83] POWER10:  Change dgemm unroll factors

Changing the unroll factors for dgemm to 8 shows improved performance with
POWER10 MMA feature.   Also made some minor changes in sgemm for edge cases.
---
 kernel/power/KERNEL.POWER10          |  14 +-
 kernel/power/dgemm_kernel_power10.c  | 473 +++++++++++++--------------
 kernel/power/dgemm_ncopy_8_power10.c | 326 ++++++++++++++++++
 kernel/power/sgemm_kernel_power10.c  |  70 ++--
 param.h                              |   4 +
 5 files changed, 589 insertions(+), 298 deletions(-)
 create mode 100644 kernel/power/dgemm_ncopy_8_power10.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index b4c7a5e41..28c39051f 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -34,12 +34,12 @@ SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 DGEMMKERNEL    =  dgemm_kernel_power10.c
-DGEMMINCOPY    = ../generic/gemm_ncopy_16.c
-DGEMMITCOPY    =  dgemm_tcopy_16_power8.S
-DGEMMONCOPY    =  dgemm_ncopy_4_power8.S
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMINCOPY    =
+DGEMMITCOPY    =
+DGEMMONCOPY    =  dgemm_ncopy_8_power10.c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_8.c
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
 DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
@@ -69,7 +69,7 @@ STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
 
 DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= dtrsm_kernel_LT_16x4_power8.S
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 
diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c
index b2a29140e..b531799a6 100644
--- a/kernel/power/dgemm_kernel_power10.c
+++ b/kernel/power/dgemm_kernel_power10.c
@@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
   )
 {
-  BLASLONG N = n;
   BLASLONG i1;
 #if defined(TRMMKERNEL)
   BLASLONG off;
@@ -158,10 +157,221 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
   off = -offset;
 #endif
   v4sf_t valpha = { alpha, alpha };
-  N = n >> 2;
-  for (i1 = 0; i1 < N; i1++)
+  for (i1 = 0; i1 < (n >> 3); i1++)
     {
-      BLASLONG i, j, temp;
+      BLASLONG j, temp;
+      FLOAT *CO;
+      FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      CO = C;
+      C += ldc << 3;
+      AO = A;
+      PREFETCH1 (A, 128);
+      PREFETCH1 (A, 256);
+      for (j = 0; j < (m >> 3); j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (8, 8);
+#else
+          BO = B;
+          temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
+	  BLASLONG l = 0;
+	  vec_t *rowA = (vec_t *) & AO[0];
+	  vec_t *rb = (vec_t *) & BO[0];
+	  __vector_pair rowB, rowB1;
+	  __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	  __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
+	  __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
+	  __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
+	  __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
+	  __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
+	  __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
+	  for (l = 1; l < temp; l++)
+	    {
+	      rowA = (vec_t *) & AO[l << 3];
+	      rb = (vec_t *) & BO[l << 3];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
+	      __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
+	      __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
+	      __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
+	      __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
+	      __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
+	    }
+	  SAVE_ACC (&acc0, 0);
+	  SAVE_ACC1 (&acc1, 0);
+	  SAVE_ACC (&acc2, 2);
+	  SAVE_ACC1 (&acc3, 2);
+	  SAVE_ACC (&acc4, 4);
+	  SAVE_ACC1 (&acc5, 4);
+	  SAVE_ACC (&acc6, 6);
+	  SAVE_ACC1 (&acc7, 6);
+	  CO += 8;
+	  AO += temp << 3;
+	  BO += temp << 3;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (8, 8)
+#endif
+	}
+      if (m & 4)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (4, 8);
+#else
+          BO = B;
+          temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0, acc1, acc2, acc3;
+	  BLASLONG l = 0;
+	  vec_t *rowA = (vec_t *) & AO[0];
+	  __vector_pair rowB, rowB1;
+	  vec_t *rb = (vec_t *) & BO[0];
+	  __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	  __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
+	  __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
+	  for (l = 1; l < temp; l++)
+	    {
+	      rowA = (vec_t *) & AO[l << 2];
+	      rb = (vec_t *) & BO[l << 3];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
+	      __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
+	    }
+	  SAVE_ACC (&acc0, 0);
+	  SAVE_ACC1 (&acc1, 0);
+	  SAVE_ACC (&acc2, 2);
+	  SAVE_ACC1 (&acc3, 2);
+	  CO += 4;
+	  AO += temp << 2;
+	  BO += temp << 3;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (4, 8)
+#endif
+	}
+      if (m & 2)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (2, 8);
+#else
+          BO = B;
+          temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0, acc1;
+	  BLASLONG l = 0;
+	  vec_t *rowA = (vec_t *) & AO[0];
+	  __vector_pair rowB, rowB1;
+	  vec_t *rb = (vec_t *) & BO[0];
+	  __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	  __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+	  for (l = 1; l < temp; l++)
+	    {
+	      rowA = (vec_t *) & AO[l << 1];
+	      rb = (vec_t *) & BO[l << 3];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+	    }
+	  SAVE_ACC (&acc0, 0);
+	  SAVE_ACC1 (&acc1, 0);
+	  CO += 2;
+	  AO += temp << 1;
+	  BO += temp << 3;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (2, 8)
+#endif
+	}
+      if (m & 1)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (1, 8);
+#else
+          BO = B;
+          temp = k;
+#endif
+	  BLASLONG l = 0;
+	  v4sf_t t = { 0, 0 };
+	  v4sf_t t1 = { 0, 0 };
+	  v4sf_t t2 = { 0, 0 };
+	  v4sf_t t3 = { 0, 0 };
+	  for (l = 0; l < temp; l++)
+	    {
+	      v4sf_t rowA = { AO[l], AO[l] };
+	      v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
+	      v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
+	      v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
+	      v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
+	      t += rowA * rowB;
+	      t1 += rowA * rowB1;
+	      t2 += rowA * rowB2;
+	      t3 += rowA * rowB3;
+	    }
+	  t = t * valpha;
+	  t1 = t1 * valpha;
+	  t2 = t2 * valpha;
+	  t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+	  CO[0 * ldc] = t[0];
+	  CO[1 * ldc] = t[1];
+	  CO[2 * ldc] = t1[0];
+	  CO[3 * ldc] = t1[1];
+	  CO[4 * ldc] = t2[0];
+	  CO[5 * ldc] = t2[1];
+	  CO[6 * ldc] = t3[0];
+	  CO[7 * ldc] = t3[1];
+#else
+	  CO[0 * ldc] += t[0];
+	  CO[1 * ldc] += t[1];
+	  CO[2 * ldc] += t1[0];
+	  CO[3 * ldc] += t1[1];
+	  CO[4 * ldc] += t2[0];
+	  CO[5 * ldc] += t2[1];
+	  CO[6 * ldc] += t3[0];
+	  CO[7 * ldc] += t3[1];
+#endif
+	  CO += 1;
+	  AO += temp;
+	  BO += temp << 3;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (1, 8)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 8;                 // number of values in A
+#endif
+      B += k << 3;
+    }
+  if (n & 4)
+    {
+      BLASLONG j, temp;
       FLOAT *CO;
       FLOAT *AO;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -172,71 +382,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       AO = A;
       PREFETCH1 (A, 128);
       PREFETCH1 (A, 256);
-      i = m >> 4;
-      for (j = 0; j < i; j++)
-	{
-          FLOAT *BO;
-#if defined(TRMMKERNEL)
-          REFRESH_POINTERS (16, 4);
-#else
-          BO = B;
-          temp = k;
-#endif
-	  v4sf_t *rowC;
-	  v4sf_t result[4];
-	  BLASLONG l = 0;
-	  PREFETCH1 (CO, 0);
-	  PREFETCH1 (CO + ldc, 0);
-	  PREFETCH1 (CO + ldc + ldc, 0);
-	  PREFETCH1 (CO + ldc + ldc + ldc, 0);
-	  PREFETCH1 (CO, 128);
-	  PREFETCH1 (CO + ldc, 128);
-	  PREFETCH1 (CO + ldc + ldc, 128);
-	  PREFETCH1 (CO + ldc + ldc + ldc, 128);
-	  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-	  vec_t *rowA = (vec_t *) & AO[0];
-	  __vector_pair rowB;
-	  vec_t *rb = (vec_t *) & BO[0];
-	  __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
-	  __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
-	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
-	  __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
-	  __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
-	  __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
-	  __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
-	  __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
-	  for (l = 1; l < temp; l++)
-	    {
-	      rowA = (vec_t *) & AO[l << 4];
-	      rb = (vec_t *) & BO[l << 2];
-	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
-	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
-	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
-	      __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
-	      __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
-	      __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
-	      __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
-	      __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
-	    }
-	  SAVE_ACC (&acc0, 0);
-	  SAVE_ACC (&acc2, 4);
-	  SAVE_ACC (&acc1, 2);
-	  SAVE_ACC (&acc3, 6);
-	  SAVE_ACC (&acc4, 8);
-	  SAVE_ACC (&acc6, 12);
-	  SAVE_ACC (&acc5, 10);
-	  SAVE_ACC (&acc7, 14);
-	  AO += temp << 4;
-	  BO += temp << 2;
-#if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (16, 4)
-#endif
-	  CO += 16;
-	}
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      for (j = 0; j < (m >> 3); j++)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (8, 4)
 #endif
 	}
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (4, 4)
 #endif
 	}
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (2, 4)
 #endif
 	}
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
       B += k << 2;
     }
-  N = (n & 3) >> 1;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 2)
     {
-      BLASLONG i, j, temp;
+      BLASLONG j, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
       off = offset;
 #endif
@@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       CO = C;
       C += ldc << 1;
       AO = A;
-      i = m >> 4;
-      for (j = 0; j < i; j++)
-	{
-	  FLOAT *BO;
-#if defined(TRMMKERNEL)
-          REFRESH_POINTERS (16, 2);
-#else
-          BO = B;
-          temp = k;
-#endif
-	  v4sf_t *rowC;
-	  v4sf_t result[4];
-	  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-	  BLASLONG l = 0;
-	  FLOAT t[4] = { 0, 0, 0, 0 };
-	  t[0] = BO[0], t[1] = BO[1];
-	  __vector_pair rowB;
-	  vec_t *rb = (vec_t *) & t[0];
-	  __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-	  vec_t *rowA = (vec_t *) & AO[0];
-	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
-	  __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
-	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
-	  __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
-	  __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
-	  __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
-	  __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
-	  __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
-	  for (l = 1; l < temp; l++)
-	    {
-	      t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-	      rb = (vec_t *) & t[0];
-	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-	      rowA = (vec_t *) & AO[l << 4];
-	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
-	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
-	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
-	      __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
-	      __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
-	      __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
-	      __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
-	      __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
-	    }
-	  SAVE2x4_ACC (&acc0, 0);
-	  SAVE2x4_ACC (&acc1, 2);
-	  SAVE2x4_ACC (&acc2, 4);
-	  SAVE2x4_ACC (&acc3, 6);
-	  SAVE2x4_ACC (&acc4, 8);
-	  SAVE2x4_ACC (&acc5, 10);
-	  SAVE2x4_ACC (&acc6, 12);
-	  SAVE2x4_ACC (&acc7, 14);
-	  CO += 16;
-	  AO += temp << 4;
-	  BO += temp << 1;
-#if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (16, 2)
-#endif
-	}
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      for (j = 0; j < (m >> 3); j++)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (8, 2)
 #endif
 	}
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (4, 2)
 #endif
 	}
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (2, 2)
 #endif
 	}
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
       B += k << 1;
     }
-  N = (n & 1) >> 0;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 1)
     {
       BLASLONG i, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       CO = C;
       C += ldc;
       AO = A;
-      i = m;
-      while (i >= 16)
-	{
-	  FLOAT *BO;
-#if defined(TRMMKERNEL)
-          REFRESH_POINTERS (16, 1)
-#else
-          BO = B;
-          temp = k;
-#endif
-	  BLASLONG l = 0;
-	  v4sf_t t = { 0, 0 };
-	  v4sf_t t1 = { 0, 0 };
-	  v4sf_t t2 = { 0, 0 };
-	  v4sf_t t3 = { 0, 0 };
-	  v4sf_t t4 = { 0, 0 };
-	  v4sf_t t5 = { 0, 0 };
-	  v4sf_t t6 = { 0, 0 };
-	  v4sf_t t7 = { 0, 0 };
-	  for (l = 0; l < temp; l++)
-	    {
-	      v4sf_t rowB = { BO[l], BO[l] };
-	      v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
-	      v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
-	      v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
-	      v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
-	      v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
-	      v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
-	      v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
-	      v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
-	      t += rowA * rowB;
-	      t1 += rowA1 * rowB;
-	      t2 += rowA2 * rowB;
-	      t3 += rowA3 * rowB;
-	      t4 += rowA4 * rowB;
-	      t5 += rowA5 * rowB;
-	      t6 += rowA6 * rowB;
-	      t7 += rowA7 * rowB;
-	    }
-	  t = t * valpha;
-	  t1 = t1 * valpha;
-	  t2 = t2 * valpha;
-	  t3 = t3 * valpha;
-	  t4 = t4 * valpha;
-	  t5 = t5 * valpha;
-	  t6 = t6 * valpha;
-	  t7 = t7 * valpha;
-#if defined(TRMMKERNEL)
-	  CO[0] = t[0];
-	  CO[1] = t[1];
-	  CO[2] = t1[0];
-	  CO[3] = t1[1];
-	  CO[4] = t2[0];
-	  CO[5] = t2[1];
-	  CO[6] = t3[0];
-	  CO[7] = t3[1];
-	  CO[8] = t4[0];
-	  CO[9] = t4[1];
-	  CO[10] = t5[0];
-	  CO[11] = t5[1];
-	  CO[12] = t6[0];
-	  CO[13] = t6[1];
-	  CO[14] = t7[0];
-	  CO[15] = t7[1];
-#else
-	  CO[0] += t[0];
-	  CO[1] += t[1];
-	  CO[2] += t1[0];
-	  CO[3] += t1[1];
-	  CO[4] += t2[0];
-	  CO[5] += t2[1];
-	  CO[6] += t3[0];
-	  CO[7] += t3[1];
-	  CO[8] += t4[0];
-	  CO[9] += t4[1];
-	  CO[10] += t5[0];
-	  CO[11] += t5[1];
-	  CO[12] += t6[0];
-	  CO[13] += t6[1];
-	  CO[14] += t7[0];
-	  CO[15] += t7[1];
-#endif
-	  AO += temp << 4;
-	  BO += temp;
-	  CO += 16;
-	  i -= 16;
-#if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (16, 1)
-#endif
-	}
-      while (i >= 8)
+      for (i = 0; i < (m >> 3); i++)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  AO += temp << 3;
 	  BO += temp;
 	  CO += 8;
-	  i -= 8;
 #if defined(TRMMKERNEL)
           REFRESH_AFTER_SAVE (8, 1)
 #endif
 	}
-      while (i >= 4)
+      if (m & 4)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  AO += temp << 2;
 	  BO += temp;
 	  CO += 4;
-	  i -= 4;
 #if defined(TRMMKERNEL)
           REFRESH_AFTER_SAVE (4, 1)
 #endif
 	}
-      while (i >= 2)
+      if (m & 2)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  AO += temp << 1;
 	  BO += temp;
 	  CO += 2;
-	  i -= 2;
 #if defined(TRMMKERNEL)
           REFRESH_AFTER_SAVE (2, 1)
 #endif
 	}
-      while (i >= 1)
+      if (m & 1)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  CO[0] += t * alpha;
 #endif
 	  CO += 1;
-	  i -= 1;
 #if defined(TRMMKERNEL)
           REFRESH_AFTER_SAVE (1, 1)
 #endif
diff --git a/kernel/power/dgemm_ncopy_8_power10.c b/kernel/power/dgemm_ncopy_8_power10.c
new file mode 100644
index 000000000..9836c2e7f
--- /dev/null
+++ b/kernel/power/dgemm_ncopy_8_power10.c
@@ -0,0 +1,326 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <altivec.h>
+#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ /* Packs n vectors of m contiguous elements (spaced lda elements apart in a) into the contiguous buffer b; always returns 0. */
+  BLASLONG i, j;
+  /* Vectors are consumed 8, then 4, 2 and 1 at a time; within a group, element i of every vector is stored side by side in b. */
+  IFLOAT *aoffset;
+  IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+  IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+
+  IFLOAT *boffset;
+  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  IFLOAT ctemp09, ctemp17, ctemp33;
+  IFLOAT ctemp25, ctemp41;
+  IFLOAT ctemp49, ctemp57;
+
+  aoffset = a;
+  boffset = b;
+
+  j = (n >> 3); /* number of complete groups of 8 vectors */
+  if (j > 0){
+    do{
+      aoffset1  = aoffset;
+      aoffset2  = aoffset1 + lda;
+      aoffset3  = aoffset2 + lda;
+      aoffset4  = aoffset3 + lda;
+      aoffset5  = aoffset4 + lda;
+      aoffset6  = aoffset5 + lda;
+      aoffset7  = aoffset6 + lda;
+      aoffset8  = aoffset7 + lda;
+      aoffset += 8 * lda; /* start of the next group of 8 vectors */
+
+      i = (m >> 3); /* 8x8 tiles handled with vector loads and stores */
+      if (i > 0){
+	do{
+	PREFETCHA (aoffset1, 384); /* dcbt touch hint at aoffsetN + 384 bytes */
+	PREFETCHA (aoffset2, 384);
+	PREFETCHA (aoffset3, 384);
+	PREFETCHA (aoffset4, 384);
+	PREFETCHA (aoffset5, 384);
+	PREFETCHA (aoffset6, 384);
+	PREFETCHA (aoffset7, 384);
+	PREFETCHA (aoffset8, 384);
+	__vector double va0 = *(__vector double*)(aoffset1 +  0); /* two elements per load; assumes IFLOAT is double in this build -- TODO confirm */
+	__vector double va1 = *(__vector double*)(aoffset1 +  2);
+	__vector double va2 = *(__vector double*)(aoffset1 +  4);
+	__vector double va3 = *(__vector double*)(aoffset1 +  6);
+
+	__vector double va4 = *(__vector double*)(aoffset2 +  0);
+	__vector double va5 = *(__vector double*)(aoffset2 +  2);
+	__vector double va6 = *(__vector double*)(aoffset2 +  4);
+	__vector double va7 = *(__vector double*)(aoffset2 +  6);
+
+	__vector double va8 = *(__vector double*)(aoffset3 +  0);
+	__vector double va9 = *(__vector double*)(aoffset3 +  2);
+	__vector double va10 = *(__vector double*)(aoffset3 + 4);
+	__vector double va11 = *(__vector double*)(aoffset3 + 6);
+
+	__vector double va12 = *(__vector double*)(aoffset4 +  0);
+	__vector double va13 = *(__vector double*)(aoffset4 +  2);
+	__vector double va14 = *(__vector double*)(aoffset4 +  4);
+	__vector double va15 = *(__vector double*)(aoffset4 +  6);
+
+	__vector double va16 = *(__vector double*)(aoffset5 +  0);
+	__vector double va17 = *(__vector double*)(aoffset5 +  2);
+	__vector double va18 = *(__vector double*)(aoffset5 +  4);
+	__vector double va19 = *(__vector double*)(aoffset5 +  6);
+
+	__vector double va20 = *(__vector double*)(aoffset6 +  0);
+	__vector double va21 = *(__vector double*)(aoffset6 +  2);
+	__vector double va22 = *(__vector double*)(aoffset6 +  4);
+	__vector double va23 = *(__vector double*)(aoffset6 +  6);
+
+	__vector double va24 = *(__vector double*)(aoffset7 +  0);
+	__vector double va25 = *(__vector double*)(aoffset7 +  2);
+	__vector double va26 = *(__vector double*)(aoffset7 + 4);
+	__vector double va27 = *(__vector double*)(aoffset7 + 6);
+
+	__vector double va28 = *(__vector double*)(aoffset8 +  0);
+	__vector double va29 = *(__vector double*)(aoffset8 +  2);
+	__vector double va30 = *(__vector double*)(aoffset8 +  4);
+	__vector double va31 = *(__vector double*)(aoffset8 +  6);
+	/* Selector 0 pairs the first element of both operands, selector 3 the second; each run of 8 outputs holds one element index from all 8 vectors. */
+	*(__vector double*)(boffset +  0) = vec_xxpermdi(va0, va4, 0);
+	*(__vector double*)(boffset +  2) = vec_xxpermdi(va8, va12, 0);
+	*(__vector double*)(boffset +  4) = vec_xxpermdi(va16, va20, 0);
+	*(__vector double*)(boffset +  6) = vec_xxpermdi(va24, va28, 0);
+	*(__vector double*)(boffset +  8) = vec_xxpermdi(va0, va4, 3);
+	*(__vector double*)(boffset +  10) = vec_xxpermdi(va8, va12, 3);
+	*(__vector double*)(boffset +  12) = vec_xxpermdi(va16, va20, 3);
+	*(__vector double*)(boffset +  14) = vec_xxpermdi(va24, va28, 3);
+
+	*(__vector double*)(boffset +  16) = vec_xxpermdi(va1, va5, 0);
+	*(__vector double*)(boffset +  18) = vec_xxpermdi(va9, va13, 0);
+	*(__vector double*)(boffset +  20) = vec_xxpermdi(va17, va21, 0);
+	*(__vector double*)(boffset +  22) = vec_xxpermdi(va25, va29, 0);
+	*(__vector double*)(boffset +  24) = vec_xxpermdi(va1, va5, 3);
+	*(__vector double*)(boffset +  26) = vec_xxpermdi(va9, va13, 3);
+	*(__vector double*)(boffset +  28) = vec_xxpermdi(va17, va21, 3);
+	*(__vector double*)(boffset +  30) = vec_xxpermdi(va25, va29, 3);
+
+	*(__vector double*)(boffset +  32) = vec_xxpermdi(va2, va6, 0);
+	*(__vector double*)(boffset +  34) = vec_xxpermdi(va10, va14, 0);
+	*(__vector double*)(boffset +  36) = vec_xxpermdi(va18, va22, 0);
+	*(__vector double*)(boffset +  38) = vec_xxpermdi(va26, va30, 0);
+	*(__vector double*)(boffset +  40) = vec_xxpermdi(va2, va6, 3);
+	*(__vector double*)(boffset +  42) = vec_xxpermdi(va10, va14, 3);
+	*(__vector double*)(boffset +  44) = vec_xxpermdi(va18, va22, 3);
+	*(__vector double*)(boffset +  46) = vec_xxpermdi(va26, va30, 3);
+
+	*(__vector double*)(boffset +  48) = vec_xxpermdi(va3, va7, 0);
+	*(__vector double*)(boffset +  50) = vec_xxpermdi(va11, va15, 0);
+	*(__vector double*)(boffset +  52) = vec_xxpermdi(va19, va23, 0);
+	*(__vector double*)(boffset +  54) = vec_xxpermdi(va27, va31, 0);
+	*(__vector double*)(boffset +  56) = vec_xxpermdi(va3, va7, 3);
+	*(__vector double*)(boffset +  58) = vec_xxpermdi(va11, va15, 3);
+	*(__vector double*)(boffset +  60) = vec_xxpermdi(va19, va23, 3);
+	*(__vector double*)(boffset +  62) = vec_xxpermdi(va27, va31, 3);
+	  aoffset1 +=  8;
+	  aoffset2 +=  8;
+	  aoffset3 +=  8;
+	  aoffset4 +=  8;
+	  aoffset5 +=  8;
+	  aoffset6 +=  8;
+	  aoffset7 +=  8;
+	  aoffset8 +=  8;
+	  boffset  += 64; /* 8 vectors x 8 elements written per tile */
+	  i --;
+	}while(i > 0);
+      }
+
+      i = (m & 7); /* leftover elements: copy one from each of the 8 vectors per pass */
+      if (i > 0){
+	do{
+	  ctemp01 = *(aoffset1 +  0);
+	  ctemp09 = *(aoffset2 +  0);
+	  ctemp17 = *(aoffset3 +  0);
+	  ctemp25 = *(aoffset4 +  0);
+	  ctemp33 = *(aoffset5 +  0);
+	  ctemp41 = *(aoffset6 +  0);
+	  ctemp49 = *(aoffset7 +  0);
+	  ctemp57 = *(aoffset8 +  0);
+
+	  *(boffset +  0) = ctemp01;
+	  *(boffset +  1) = ctemp09;
+	  *(boffset +  2) = ctemp17;
+	  *(boffset +  3) = ctemp25;
+	  *(boffset +  4) = ctemp33;
+	  *(boffset +  5) = ctemp41;
+	  *(boffset +  6) = ctemp49;
+	  *(boffset +  7) = ctemp57;
+
+	  aoffset1 ++;
+	  aoffset2 ++;
+	  aoffset3 ++;
+	  aoffset4 ++;
+	  aoffset5 ++;
+	  aoffset6 ++;
+	  aoffset7 ++;
+	  aoffset8 ++;
+
+	  boffset += 8;
+	  i --;
+	}while(i > 0);
+      }
+      j--;
+    }while(j > 0);
+  } /* end of if(j > 0) */
+
+  if (n & 4){ /* one group of 4 vectors, when bit 2 of n is set */
+    aoffset1  = aoffset;
+    aoffset2  = aoffset1 + lda;
+    aoffset3  = aoffset2 + lda;
+    aoffset4  = aoffset3 + lda;
+    aoffset += 4 * lda;
+
+    i = (m >> 2); /* 4x4 tiles handled with vector loads and stores */
+    if (i > 0){
+      do{
+	PREFETCHA (aoffset1, 384);
+	PREFETCHA (aoffset2, 384);
+	PREFETCHA (aoffset3, 384);
+	PREFETCHA (aoffset4, 384);
+	__vector double va0 = *(__vector double*)(aoffset1 +  0);
+	__vector double va1 = *(__vector double*)(aoffset1 +  2);
+	__vector double va2 = *(__vector double*)(aoffset2 +  0);
+	__vector double va3 = *(__vector double*)(aoffset2 +  2);
+	__vector double va4 = *(__vector double*)(aoffset3 +  0);
+	__vector double va5 = *(__vector double*)(aoffset3 +  2);
+	__vector double va6 = *(__vector double*)(aoffset4 +  0);
+	__vector double va7 = *(__vector double*)(aoffset4 +  2);
+	*(__vector double*)(boffset +  0) = vec_xxpermdi(va0, va2, 0);
+	*(__vector double*)(boffset +  2) = vec_xxpermdi(va4, va6, 0);
+	*(__vector double*)(boffset +  4) = vec_xxpermdi(va0, va2, 3);
+	*(__vector double*)(boffset +  6) = vec_xxpermdi(va4, va6, 3);
+	*(__vector double*)(boffset +  8) = vec_xxpermdi(va1, va3, 0);
+	*(__vector double*)(boffset +  10) = vec_xxpermdi(va5, va7, 0);
+	*(__vector double*)(boffset +  12) = vec_xxpermdi(va1, va3, 3);
+	*(__vector double*)(boffset +  14) = vec_xxpermdi(va5, va7, 3);
+
+	aoffset1 +=  4;
+	aoffset2 +=  4;
+	aoffset3 +=  4;
+	aoffset4 +=  4;
+	boffset  +=  16;
+	i --;
+      }while(i > 0);
+    }
+
+    i = (m & 3); /* leftover elements, one from each of the 4 vectors per pass */
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset2 +  0);
+	ctemp03 = *(aoffset3 +  0);
+	ctemp04 = *(aoffset4 +  0);
+
+	*(boffset +  0) = ctemp01;
+	*(boffset +  1) = ctemp02;
+	*(boffset +  2) = ctemp03;
+	*(boffset +  3) = ctemp04;
+
+	aoffset1 ++;
+	aoffset2 ++;
+	aoffset3 ++;
+	aoffset4 ++;
+
+	boffset += 4;
+	i --;
+      }while(i > 0);
+    }
+  } /* end of if (n & 4) */
+
+  if (n & 2){ /* one pair of vectors, when bit 1 of n is set */
+    aoffset1  = aoffset;
+    aoffset2  = aoffset1 + lda;
+    aoffset += 2 * lda;
+
+    i = (m >> 1); /* 2x2 tiles */
+    if (i > 0){
+      do{
+	__vector double va0 = *(__vector double*)(aoffset1 +  0);
+	__vector double va1 = *(__vector double*)(aoffset2 +  0);
+	*(__vector double*)(boffset +  0) = vec_xxpermdi(va0, va1, 0);
+	*(__vector double*)(boffset +  2) = vec_xxpermdi(va0, va1, 3);
+
+	aoffset1 +=  2;
+	aoffset2 +=  2;
+	boffset  +=  4;
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){ /* single leftover element from each of the two vectors */
+      ctemp01 = *(aoffset1 +  0);
+      ctemp02 = *(aoffset2 +  0);
+
+      *(boffset +  0) = ctemp01;
+      *(boffset +  1) = ctemp02;
+
+      aoffset1 ++;
+      aoffset2 ++;
+      boffset += 2;
+    }
+  } /* end of if (n & 2) */
+
+  if (n & 1){ /* last single vector: straight element-by-element copy */
+    aoffset1  = aoffset;
+
+    i = m;
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+
+	*(boffset +  0) = ctemp01;
+
+	aoffset1 ++;
+	boffset  ++;
+	i --;
+      }while(i > 0);
+    }
+
+  } /* end of if (n & 1) */
+
+  return 0;
+}
diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c
index 9fbf84695..80f495f70 100644
--- a/kernel/power/sgemm_kernel_power10.c
+++ b/kernel/power/sgemm_kernel_power10.c
@@ -197,7 +197,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
   )
 {
-  BLASLONG N = n;
   BLASLONG i1;
 #if defined(TRMMKERNEL)
   BLASLONG off;
@@ -207,10 +206,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
 
   v4sf_t valpha = { alpha, alpha, alpha, alpha };
-  N = n >> 3;
-  for (i1 = 0; i1 < N; i1++)
+  for (i1 = 0; i1 < (n >> 3); i1++)
     {
-      BLASLONG i, j, temp;
+      BLASLONG j, temp;
       FLOAT *CO;
       FLOAT *AO;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       AO = A;
       PREFETCH1 (A, 128);
       PREFETCH1 (A, 256);
-      i = m >> 4;
-      for (j = 0; j < i; j++)
+      for (j = 0; j < (m >> 4); j++)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
 	    CO += 16;
 	}
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      if (m & 8)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (8, 8)
 #endif
 	}
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (4, 8)
 #endif
 	}
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -550,8 +544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (2, 8)
 #endif
 	}
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 
       B += k << 3;
     }
-  N = (n & 7) >> 2;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 4)
     {
       BLASLONG i, j, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (16, 4)
 #endif
 	}
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      if (m & 8)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (8, 4)
 #endif
 	}
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (4, 4)
 #endif
 	}
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (2, 4)
 #endif
 	}
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 
       B += k << 2;
     }
-  N = (n & 3) >> 1;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 2)
     {
       BLASLONG i, j, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (16, 2)
 #endif
 	}
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      if (m & 8)
 	{
 	  FLOAT *BO;
 	  v4sf_t *rowC;
@@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (8, 2)
 #endif
 	}
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
 	{
 	  FLOAT *BO;
 	  v4sf_t *rowC;
@@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (4, 2)
 #endif
 	}
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
 	{
 	  FLOAT *BO;
 	  BLASLONG l = 0;
@@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  REFRESH_AFTER_SAVE (2, 2)
 #endif
 	}
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
 	{
 	  FLOAT *BO;
 	  BLASLONG l = 0;
@@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 
       B += k << 1;
     }
-  N = (n & 1) >> 0;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 1)
     {
       BLASLONG i, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       CO = C;
       C += ldc;
       AO = A;
-      i = m;
-      while (i >= 16)
+      for (i = 0; i < (m >> 4); i++)
 	{
 	  FLOAT *BO;
 	  BLASLONG l = 0;
@@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  AO += temp << 4;
 	  BO += temp;
 	  CO += 16;
-	  i -= 16;
 #if defined(TRMMKERNEL)
 	  REFRESH_AFTER_SAVE (16, 1)
 #endif
 	}
-      while (i >= 8)
+      if (m & 8)
 	{
 	  FLOAT *BO;
 	  BLASLONG l = 0;
@@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  AO += temp << 3;
 	  BO += temp;
 	  CO += 8;
-	  i -= 8;
 #if defined(TRMMKERNEL)
 	  REFRESH_AFTER_SAVE (8, 1)
 #endif
 	}
-      while (i >= 4)
+      if (m & 4)
 	{
 	  FLOAT *BO;
 	  BLASLONG l = 0;
@@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  AO += temp << 2;
 	  BO += temp;
 	  CO += 4;
-	  i -= 4;
 #if defined(TRMMKERNEL)
 	  REFRESH_AFTER_SAVE (4, 1)
 #endif
 	}
-      while (i >= 2)
+      if (m & 2)
 	{
 	  FLOAT *BO;
 	  BLASLONG l = 0;
@@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  AO += temp << 1;
 	  BO += temp;
 	  CO += 2;
-	  i -= 2;
 #if defined(TRMMKERNEL)
 	  REFRESH_AFTER_SAVE (2, 1)
 #endif
 	}
-      while (i >= 1)
+      if (m & 1)
 	{
 	  FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 	  CO[0] += t * alpha;
 #endif
 	  CO += 1;
-	  i -= 1;
 #if defined(TRMMKERNEL)
 	  REFRESH_AFTER_SAVE (1, 1)
 #endif
diff --git a/param.h b/param.h
index f3ddde6a1..2047e4776 100644
--- a/param.h
+++ b/param.h
@@ -2436,6 +2436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SBGEMM_DEFAULT_P 832
 #define SBGEMM_DEFAULT_Q 1026
 #define SBGEMM_DEFAULT_R 4096
+#undef DGEMM_DEFAULT_UNROLL_M
+#undef DGEMM_DEFAULT_UNROLL_N
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 8
 #endif
 
 #if defined(SPARC) && defined(V7)

From 40a93c232b6a9a09fb0cf10a8de5ba6ca94070a8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 1 Nov 2020 21:58:26 +0100
Subject: [PATCH 64/83] Disable EXPRECISION for DYNAMIC_ARCH in combination
 with TARGET=GENERIC

NO_EXPRECISION is disabled for the GENERIC_TARGET already, so prevent mixing with code parts that use a different float size by default
---
 Makefile.system | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 52d3e2cdc..b62eab379 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -93,6 +93,11 @@ endif
 ifdef TARGET
 GETARCH_FLAGS := -DFORCE_$(TARGET)
 GETARCH_FLAGS += -DUSER_TARGET
+ifeq ($(TARGET), GENERIC)
+ifeq ($(DYNAMIC_ARCH), 1)
+override NO_EXPRECISION=1
+endif
+endif
 endif
 
 # Force fallbacks for 32bit

From 6baf8af6588725ee720bcfad12e235a61df5deb2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 1 Nov 2020 22:11:48 +0100
Subject: [PATCH 65/83] Disable EXPRECISION for the combination of DYNAMIC_CORE
 and GENERIC target

---
 cmake/os.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmake/os.cmake b/cmake/os.cmake
index c644bc3f7..98428c624 100644
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@@ -84,6 +84,10 @@ if (X86)
   set(NO_EXPRECISION 1)
 endif ()
 
+if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC))
+  set(NO_EXPRECISION 1)
+endif ()
+
 if (UTEST_CHECK)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK")
   set(SANITY_CHECK 1)

From e5f8c2bf8ae438ec6b626f9fe6711101ad004d3d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 1 Nov 2020 22:25:43 +0100
Subject: [PATCH 66/83] typo fix

---
 cmake/os.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/os.cmake b/cmake/os.cmake
index 98428c624..1eb2b7472 100644
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@@ -84,7 +84,7 @@ if (X86)
   set(NO_EXPRECISION 1)
 endif ()
 
-if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC))
+if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC"))
   set(NO_EXPRECISION 1)
 endif ()
 

From b9bc76aec4c869fed0b5cfbbe11336206a6ff5ec Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 2 Nov 2020 22:43:50 +0100
Subject: [PATCH 67/83] Add files via upload

---
 cmake/os.cmake       |  4 +++-
 cmake/prebuild.cmake | 30 ++++++++++++++++++++++++++++++
 cmake/system.cmake   | 31 +++++++++++++++++++++++++++++--
 3 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/cmake/os.cmake b/cmake/os.cmake
index 1eb2b7472..feb4c05d1 100644
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@@ -84,9 +84,11 @@ if (X86)
   set(NO_EXPRECISION 1)
 endif ()
 
-if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC"))
+if (DYNAMIC_ARCH)
+if (${TARGET} STREQUAL "GENERIC")
   set(NO_EXPRECISION 1)
 endif ()
+endif ()
 
 if (UTEST_CHECK)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK")
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index 3e38abbf5..b1b4c501a 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -139,6 +139,36 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
       set(CGEMM3M_UNROLL_N 4)
       set(ZGEMM3M_UNROLL_M 4)
       set(ZGEMM3M_UNROLL_N 4)
+  elseif ("${TCORE}" STREQUAL "BARCELONA")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_SSE3\n")
+  elseif ("${TCORE}" STREQUAL "STEAMROLLER")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_SSE3\n")
+  elseif ("${TCORE}" STREQUAL "EXCAVATOR")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_SSE3\n")
+  elseif ("${TCORE}" STREQUAL "NEHALEM")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_SSE3\n")
+  elseif ("${TCORE}" STREQUAL "PRESCOTT")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_SSE3\n")
+  elseif ("${TCORE}" STREQUAL "SANDYBRIDGE")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_AVX\n")
+  elseif ("${TCORE}" STREQUAL "HASWELL")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_AVX2\n")
+  elseif ("${TCORE}" STREQUAL "ZEN")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_AVX2\n")
+  elseif ("${TCORE}" STREQUAL "SKYLAKEX")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_AVX512\n")
+  elseif ("${TCORE}" STREQUAL "COOPERLAKE")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define HAVE_AVX512\n")
   elseif ("${TCORE}" STREQUAL "ARMV7")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE\t65536\n"
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 4cc46236d..83b79bab2 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -64,12 +64,39 @@ if (DEFINED TARGET)
     if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
       execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
       if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
-        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
       endif()
     elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
     endif()
   endif()
+  if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2)
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
+  endif()
+  if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX)
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx")
+  endif()
+  if (${TARGET} STREQUAL "BARCELONA")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+  endif()
+  if (${TARGET} STREQUAL "STEAMROLLER")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+  endif()
+  if (${TARGET} STREQUAL "EXCAVATOR")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+  endif()
+  if (${TARGET} STREQUAL "PILEDRIVER")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+  endif()
+  if (${TARGET} STREQUAL "PRESCOTT")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+  endif()
+  if (${TARGET} STREQUAL "NEHALEM")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+  endif()
+  if (${TARGET} STREQUAL "CORE2")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+  endif()
   if (DEFINED HAVE_SSE)
     set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
   endif()

From a9f9354296d448ffc087fc618d4fc9c39b56f72c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 2 Nov 2020 23:17:46 +0100
Subject: [PATCH 68/83] Fix target test

---
 cmake/os.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmake/os.cmake b/cmake/os.cmake
index feb4c05d1..e24059dd5 100644
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@@ -85,10 +85,12 @@ if (X86)
 endif ()
 
 if (DYNAMIC_ARCH)
+if (TARGET)
 if (${TARGET} STREQUAL "GENERIC")
   set(NO_EXPRECISION 1)
 endif ()
 endif ()
+endif ()
 
 if (UTEST_CHECK)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK")

From 0155cd53a3c29e8a57cdef504a4a685bc7ea098a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 3 Nov 2020 23:45:49 +0100
Subject: [PATCH 69/83] Add -msse3 where needed for DYNAMIC_ARCH builds

---
 cmake/system.cmake | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 83b79bab2..48d206b12 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -67,34 +67,31 @@ if (DEFINED TARGET)
         set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
       endif()
     elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2")
     endif()
   endif()
+  if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
+  endif()
   if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2)
       set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
   endif()
   if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX)
       set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx")
   endif()
-  if (${TARGET} STREQUAL "BARCELONA")
+  if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR")
       set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
   endif()
-  if (${TARGET} STREQUAL "STEAMROLLER")
+  if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3")
       set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
   endif()
-  if (${TARGET} STREQUAL "EXCAVATOR")
+  if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO")
       set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
   endif()
-  if (${TARGET} STREQUAL "PILEDRIVER")
+  if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM")
       set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
   endif()
-  if (${TARGET} STREQUAL "PRESCOTT")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
-  endif()
-  if (${TARGET} STREQUAL "NEHALEM")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
-  endif()
-  if (${TARGET} STREQUAL "CORE2")
+  if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON")
       set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
   endif()
   if (DEFINED HAVE_SSE)

From 8cc73fee98684b49fdd1869e44b3d6a816cdb407 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 3 Nov 2020 23:47:04 +0100
Subject: [PATCH 70/83] Export NO_EXPRECISION after overriding for DYNAMIC_ARCH
 with GENERIC target

---
 Makefile.system | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile.system b/Makefile.system
index b62eab379..ca302a98a 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -96,6 +96,7 @@ GETARCH_FLAGS += -DUSER_TARGET
 ifeq ($(TARGET), GENERIC)
 ifeq ($(DYNAMIC_ARCH), 1)
 override NO_EXPRECISION=1
+export NO_EXPRECISION
 endif
 endif
 endif

From d9ba49165af15d535d9b9955bd248eab4d259f06 Mon Sep 17 00:00:00 2001
From: Gengxin Xie <gengxin.xie@intel.com>
Date: Sun, 27 Sep 2020 10:38:19 +0800
Subject: [PATCH 71/83] Improve the performance of rot by using AVX512 and AVX2
 intrinsic

---
 driver/others/blas_l1_thread.c         |   2 +-
 driver/others/blas_server_win32.c      |  11 +-
 kernel/x86_64/KERNEL.HASWELL           |   3 +
 kernel/x86_64/drot.c                   | 139 +++++++++++++++++++++++++
 kernel/x86_64/drot_microk_haswell-2.c  |  87 ++++++++++++++++
 kernel/x86_64/drot_microk_skylakex-2.c |  94 +++++++++++++++++
 kernel/x86_64/srot.c                   | 139 +++++++++++++++++++++++++
 kernel/x86_64/srot_microk_haswell-2.c  |  87 ++++++++++++++++
 kernel/x86_64/srot_microk_skylakex-2.c |  91 ++++++++++++++++
 9 files changed, 648 insertions(+), 5 deletions(-)
 create mode 100644 kernel/x86_64/drot.c
 create mode 100644 kernel/x86_64/drot_microk_haswell-2.c
 create mode 100644 kernel/x86_64/drot_microk_skylakex-2.c
 create mode 100644 kernel/x86_64/srot.c
 create mode 100644 kernel/x86_64/srot_microk_haswell-2.c
 create mode 100644 kernel/x86_64/srot_microk_skylakex-2.c

diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c
index 04acbcc5f..06039c952 100644
--- a/driver/others/blas_l1_thread.c
+++ b/driver/others/blas_l1_thread.c
@@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
     break;
   }
 
-  mode |= BLAS_LEGACY;
+  if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY;
 
   for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);
 
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index d2cc91757..f47908c70 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
 
   routine = queue -> routine;
 
-    if (!(queue -> mode & BLAS_LEGACY)) {
+  if (queue -> mode & BLAS_LEGACY) {
+    legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
+  } else
+    if (queue -> mode & BLAS_PTHREAD) {
+      void (*pthreadcompat)(void *) = queue -> routine;
+      (pthreadcompat)(queue -> args);
+    } else
       (routine)(queue -> args, queue -> range_m, queue -> range_n,
 		queue -> sa, queue -> sb, 0);
-    } else {
-      legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
-    }
 
   if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
 
diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index b979fc0ae..81eaf96ac 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -102,3 +102,6 @@ ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_haswell.c
 
 SASUMKERNEL = sasum.c
 DASUMKERNEL = dasum.c
+
+SROTKERNEL = srot.c
+DROTKERNEL = drot.c
diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c
new file mode 100644
index 000000000..a312b7ff9
--- /dev/null
+++ b/kernel/x86_64/drot.c
@@ -0,0 +1,139 @@
+#include "common.h"
+
+#if defined(SKYLAKEX)
+#include "drot_microk_skylakex-2.c"
+#elif defined(HASWELL)
+#include "drot_microk_haswell-2.c"
+#endif
+
+#ifndef HAVE_DROT_KERNEL
+/* Plain C fallback: for i in [0, n), x[i] = c*x[i] + s*y[i] and y[i] = c*y[i] - s*x[i], both from the old values. */
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    FLOAT f0, f1, f2, f3;
+    FLOAT x0, x1, x2, x3;
+    FLOAT g0, g1, g2, g3;
+    FLOAT y0, y1, y2, y3;
+
+    FLOAT* xp = x;
+    FLOAT* yp = y;
+    /* n1 is n rounded down to a multiple of 8, while each pass of the loop below covers 4 elements */
+    BLASLONG n1 = n & (~7);
+
+    while (i < n1) {
+        x0 = xp[0];
+        y0 = yp[0];
+        x1 = xp[1];
+        y1 = yp[1];
+        x2 = xp[2];
+        y2 = yp[2];
+        x3 = xp[3];
+        y3 = yp[3];
+        /* all four rotated pairs are computed from the loaded values before anything is stored */
+        f0 = c*x0 + s*y0;
+        g0 = c*y0 - s*x0;
+        f1 = c*x1 + s*y1;
+        g1 = c*y1 - s*x1;
+        f2 = c*x2 + s*y2;
+        g2 = c*y2 - s*x2;
+        f3 = c*x3 + s*y3;
+        g3 = c*y3 - s*x3;
+
+        xp[0] = f0;
+        yp[0] = g0;
+        xp[1] = f1;
+        yp[1] = g1;
+        xp[2] = f2;
+        yp[2] = g2;
+        xp[3] = f3;
+        yp[3] = g3;
+
+        xp += 4;
+        yp += 4;
+        i += 4;
+    }
+    /* scalar loop for the elements from n1 up to n (fewer than 8) */
+    while (i < n) {
+        FLOAT temp = c*x[i] + s*y[i];
+        y[i] = c*y[i] - s*x[i];
+        x[i] = temp;
+
+        i++;
+    }
+}
+
+#endif
+static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0, iy = 0;
+    /* Rotates n (x, y) pairs in place: unit strides go to drot_kernel, any other strides use the scalar loop. */
+    FLOAT temp;
+    /* nothing to do when the count is zero or negative */
+    if (n <= 0)
+        return;
+    if ((inc_x == 1) && (inc_y == 1)) {
+            drot_kernel(n, x, y, c, s);
+    }
+    else {
+        while (i < n) {
+            temp = c * x[ix] + s * y[iy];
+            y[iy] = c * y[iy] - s * x[ix];
+            x[ix] = temp;
+
+            ix += inc_x;
+            iy += inc_y;
+            i++;
+        }
+    }
+    return;
+}
+
+
+#if defined(SMP)
+static int rot_thread_function(blas_arg_t *args)
+{
+    /* Callback handed to blas_level1_thread: unpacks count, vectors, strides and the {c, s} pair, then rotates. */
+    rot_compute(args->m, 
+            args->a, args->lda, 
+            args->b, args->ldb, 
+            ((FLOAT *)args->alpha)[0], 
+            ((FLOAT *)args->alpha)[1]);
+    return 0;
+}
+
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT alpha[2]={c, s};
+    FLOAT dummy_c;
+#endif
+    /* Entry point: applies the rotation (c, s) to x and y in place and always returns 0. */
+#if defined(SMP)
+    if (inc_x == 0 || inc_y == 0 || n <= 100000) {
+        nthreads = 1;
+    }
+    else {
+        nthreads = num_cpu_avail(1);
+    }
+    /* one thread: rotate directly; otherwise pass c and s through alpha[] to rot_thread_function */
+    if (nthreads == 1) {
+        rot_compute(n, x, inc_x, y, inc_y, c, s);
+    }
+    else {
+#if defined(DOUBLE)
+	    int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
+#else
+	    int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
+#endif
+	    blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
+    }
+#else	
+    rot_compute(n, x, inc_x, y, inc_y, c, s);
+#endif
+    return 0;
+}
diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c
new file mode 100644
index 000000000..72a87696e
--- /dev/null
+++ b/kernel/x86_64/drot_microk_haswell-2.c
@@ -0,0 +1,87 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_DROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+/* 256-bit (__m256d) version with fused multiply-add: rotates x[0..n) and y[0..n) in place by (c, s). */
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    /* n rounded down to multiples of 4 and 16: the end points of the two vector loops */
+    BLASLONG tail_index_4 = n&(~3);
+    BLASLONG tail_index_16 = n&(~15);
+
+    __m256d c_256, s_256;
+    if (n >= 4) {
+        c_256 = _mm256_set1_pd(c);
+        s_256 = _mm256_set1_pd(s);
+    }
+    /* c_256 and s_256 stay unset for n < 4, where neither vector loop runs */
+    __m256d x0, x1, x2, x3;
+    __m256d y0, y1, y2, y3;
+    __m256d t0, t1, t2, t3;
+    /* main loop: 16 elements (four 4-wide vectors) per pass, using unaligned loads and stores */
+    for (i = 0; i < tail_index_16; i += 16) {
+        x0 = _mm256_loadu_pd(&x[i + 0]);
+        x1 = _mm256_loadu_pd(&x[i + 4]);
+        x2 = _mm256_loadu_pd(&x[i + 8]);
+        x3 = _mm256_loadu_pd(&x[i +12]);
+        y0 = _mm256_loadu_pd(&y[i + 0]);
+        y1 = _mm256_loadu_pd(&y[i + 4]);
+        y2 = _mm256_loadu_pd(&y[i + 8]);
+        y3 = _mm256_loadu_pd(&y[i +12]);
+        /* new x = c*x + s*y */
+        t0 = _mm256_mul_pd(s_256, y0);
+        t1 = _mm256_mul_pd(s_256, y1);
+        t2 = _mm256_mul_pd(s_256, y2);
+        t3 = _mm256_mul_pd(s_256, y3);
+
+        t0 = _mm256_fmadd_pd(c_256, x0, t0);
+        t1 = _mm256_fmadd_pd(c_256, x1, t1);
+        t2 = _mm256_fmadd_pd(c_256, x2, t2);
+        t3 = _mm256_fmadd_pd(c_256, x3, t3);
+
+        _mm256_storeu_pd(&x[i + 0], t0);
+        _mm256_storeu_pd(&x[i + 4], t1);
+        _mm256_storeu_pd(&x[i + 8], t2);
+        _mm256_storeu_pd(&x[i +12], t3);
+        /* new y = c*y - s*x, computed from the x vectors loaded before the stores above */
+        t0 = _mm256_mul_pd(s_256, x0);
+        t1 = _mm256_mul_pd(s_256, x1);
+        t2 = _mm256_mul_pd(s_256, x2);
+        t3 = _mm256_mul_pd(s_256, x3);
+
+        t0 = _mm256_fmsub_pd(c_256, y0, t0);
+        t1 = _mm256_fmsub_pd(c_256, y1, t1);
+        t2 = _mm256_fmsub_pd(c_256, y2, t2);
+        t3 = _mm256_fmsub_pd(c_256, y3, t3);
+
+        _mm256_storeu_pd(&y[i + 0], t0);
+        _mm256_storeu_pd(&y[i + 4], t1);
+        _mm256_storeu_pd(&y[i + 8], t2);
+        _mm256_storeu_pd(&y[i +12], t3);
+
+    }
+    /* one 4-wide vector per pass for what is left below tail_index_4 */
+    for (i = tail_index_16; i < tail_index_4; i += 4) {
+        x0 = _mm256_loadu_pd(&x[i]);
+        y0 = _mm256_loadu_pd(&y[i]);
+
+        t0 = _mm256_mul_pd(s_256, y0);
+        t0 = _mm256_fmadd_pd(c_256, x0, t0);
+        _mm256_storeu_pd(&x[i], t0);
+        
+        t0 = _mm256_mul_pd(s_256, x0);
+        t0 = _mm256_fmsub_pd(c_256, y0, t0);
+        _mm256_storeu_pd(&y[i], t0);
+    }
+    /* scalar tail: the last n & 3 elements */
+    for (i = tail_index_4; i < n; ++i) {
+        FLOAT temp = c * x[i] + s * y[i];
+        y[i] = c * y[i] - s * x[i];
+        x[i] = temp;
+    }
+}
+#endif
diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c
new file mode 100644
index 000000000..4e862e663
--- /dev/null
+++ b/kernel/x86_64/drot_microk_skylakex-2.c
@@ -0,0 +1,94 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_DROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+/* 512-bit (__m512d) version with fused multiply-add: rotates x[0..n) and y[0..n) in place by (c, s). */
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG n1 = n;
+    
+    BLASLONG tail_index_8 = 0;
+    BLASLONG tail_index_32 = 0;
+
+    __m512d c_512 = _mm512_set1_pd(c);
+    __m512d s_512 = _mm512_set1_pd(s);
+    /* n1 rounded down to multiples of 8 and 32: the end points of the two vector loops */
+    tail_index_8 = n1 & (~7);
+    tail_index_32 = n1 & (~31);
+
+
+    __m512d x0, x1, x2, x3;
+    __m512d y0, y1, y2, y3;
+    __m512d t0, t1, t2, t3;
+    /* main loop: 32 elements (four 8-wide vectors) per pass, using unaligned loads and stores */
+    for (i = 0; i < tail_index_32; i += 32) {
+        x0 = _mm512_loadu_pd(&x[i + 0]);
+        x1 = _mm512_loadu_pd(&x[i + 8]);
+        x2 = _mm512_loadu_pd(&x[i +16]);
+        x3 = _mm512_loadu_pd(&x[i +24]);
+        y0 = _mm512_loadu_pd(&y[i + 0]);
+        y1 = _mm512_loadu_pd(&y[i + 8]);
+        y2 = _mm512_loadu_pd(&y[i +16]);
+        y3 = _mm512_loadu_pd(&y[i +24]);
+        /* new x = c*x + s*y */
+        t0 = _mm512_mul_pd(s_512, y0);
+        t1 = _mm512_mul_pd(s_512, y1);
+        t2 = _mm512_mul_pd(s_512, y2);
+        t3 = _mm512_mul_pd(s_512, y3);
+
+        t0 = _mm512_fmadd_pd(c_512, x0, t0);
+        t1 = _mm512_fmadd_pd(c_512, x1, t1);
+        t2 = _mm512_fmadd_pd(c_512, x2, t2);
+        t3 = _mm512_fmadd_pd(c_512, x3, t3);
+
+        _mm512_storeu_pd(&x[i + 0], t0);
+        _mm512_storeu_pd(&x[i + 8], t1);
+        _mm512_storeu_pd(&x[i +16], t2);
+        _mm512_storeu_pd(&x[i +24], t3);
+        /* new y = c*y - s*x, computed from the x vectors loaded before the stores above */
+        t0 = _mm512_mul_pd(s_512, x0);
+        t1 = _mm512_mul_pd(s_512, x1);
+        t2 = _mm512_mul_pd(s_512, x2);
+        t3 = _mm512_mul_pd(s_512, x3);
+
+        t0 = _mm512_fmsub_pd(c_512, y0, t0);
+        t1 = _mm512_fmsub_pd(c_512, y1, t1);
+        t2 = _mm512_fmsub_pd(c_512, y2, t2);
+        t3 = _mm512_fmsub_pd(c_512, y3, t3);
+
+        _mm512_storeu_pd(&y[i + 0], t0);
+        _mm512_storeu_pd(&y[i + 8], t1);
+        _mm512_storeu_pd(&y[i +16], t2);
+        _mm512_storeu_pd(&y[i +24], t3);
+    }
+    /* one 8-wide vector per pass for what is left below tail_index_8 */
+    for (i = tail_index_32; i < tail_index_8; i += 8) {
+        x0 = _mm512_loadu_pd(&x[i]);
+        y0 = _mm512_loadu_pd(&y[i]);
+
+        t0 = _mm512_mul_pd(s_512, y0);
+        t0 = _mm512_fmadd_pd(c_512, x0, t0);
+        _mm512_storeu_pd(&x[i], t0);
+
+        t0 = _mm512_mul_pd(s_512, x0);
+        t0 = _mm512_fmsub_pd(c_512, y0, t0);
+        _mm512_storeu_pd(&y[i], t0);
+    }
+    /* tail: a mask with the low (n1 & 7) bits set restricts the load and store to the remaining elements */
+    if ((n1&7) > 0) {
+        unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n1&7)));
+	__m512d tail_x = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x[tail_index_8]);
+	__m512d tail_y = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &y[tail_index_8]);
+	__m512d temp = _mm512_mul_pd(s_512, tail_y);
+	temp = _mm512_fmadd_pd(c_512, tail_x, temp);
+	_mm512_mask_storeu_pd(&x[tail_index_8],*((__mmask8*)&tail_mask8), temp);
+        temp = _mm512_mul_pd(s_512, tail_x);
+        temp = _mm512_fmsub_pd(c_512, tail_y, temp);
+        _mm512_mask_storeu_pd(&y[tail_index_8], *((__mmask8*)&tail_mask8), temp);	
+    }
+}
+#endif
diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c
new file mode 100644
index 000000000..021c20d82
--- /dev/null
+++ b/kernel/x86_64/srot.c
@@ -0,0 +1,139 @@
+#include "common.h"
+
+#if defined(SKYLAKEX)
+#include "srot_microk_skylakex-2.c"
+#elif defined(HASWELL)
+#include "srot_microk_haswell-2.c"
+#endif
+
+#ifndef HAVE_SROT_KERNEL
+/* Plain C fallback: for i in [0, n), x[i] = c*x[i] + s*y[i] and y[i] = c*y[i] - s*x[i], both from the old values. */
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    FLOAT f0, f1, f2, f3;
+    FLOAT x0, x1, x2, x3;
+    FLOAT g0, g1, g2, g3;
+    FLOAT y0, y1, y2, y3;
+
+    FLOAT* xp = x;
+    FLOAT* yp = y;
+    /* n1 is n rounded down to a multiple of 8, while each pass of the loop below covers 4 elements */
+    BLASLONG n1 = n & (~7);
+
+    while (i < n1) {
+        x0 = xp[0];
+        y0 = yp[0];
+        x1 = xp[1];
+        y1 = yp[1];
+        x2 = xp[2];
+        y2 = yp[2];
+        x3 = xp[3];
+        y3 = yp[3];
+        /* all four rotated pairs are computed from the loaded values before anything is stored */
+        f0 = c*x0 + s*y0;
+        g0 = c*y0 - s*x0;
+        f1 = c*x1 + s*y1;
+        g1 = c*y1 - s*x1;
+        f2 = c*x2 + s*y2;
+        g2 = c*y2 - s*x2;
+        f3 = c*x3 + s*y3;
+        g3 = c*y3 - s*x3;
+
+        xp[0] = f0;
+        yp[0] = g0;
+        xp[1] = f1;
+        yp[1] = g1;
+        xp[2] = f2;
+        yp[2] = g2;
+        xp[3] = f3;
+        yp[3] = g3;
+
+        xp += 4;
+        yp += 4;
+        i += 4;
+    }
+    /* scalar loop for the elements from n1 up to n (fewer than 8) */
+    while (i < n) {
+        FLOAT temp = c*x[i] + s*y[i];
+        y[i] = c*y[i] - s*x[i];
+        x[i] = temp;
+
+        i++;
+    }
+}
+
+#endif
+static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0, iy = 0;
+    /* Rotates n (x, y) pairs in place: unit strides go to srot_kernel, any other strides use the scalar loop. */
+    FLOAT temp;
+    /* nothing to do when the count is zero or negative */
+    if (n <= 0)
+        return;
+    if ((inc_x == 1) && (inc_y == 1)) {
+            srot_kernel(n, x, y, c, s);
+    }
+    else {
+        while (i < n) {
+            temp = c * x[ix] + s * y[iy];
+            y[iy] = c * y[iy] - s * x[ix];
+            x[ix] = temp;
+
+            ix += inc_x;
+            iy += inc_y;
+            i++;
+        }
+    }
+    return;
+}
+
+
+#if defined(SMP)
+static int rot_thread_function(blas_arg_t *args)
+{
+    /* Callback handed to blas_level1_thread: unpacks count, vectors, strides and the {c, s} pair (read as float). */
+    rot_compute(args->m, 
+            args->a, args->lda, 
+            args->b, args->ldb, 
+            ((float *)args->alpha)[0], 
+            ((float *)args->alpha)[1]);
+    return 0;
+}
+
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT alpha[2]={c, s};
+    FLOAT dummy_c;
+#endif
+    /* Entry point: applies the rotation (c, s) to x and y in place and always returns 0. */
+#if defined(SMP)
+    if (inc_x == 0 || inc_y == 0 || n <= 100000) {
+        nthreads = 1;
+    }
+    else {
+        nthreads = num_cpu_avail(1);
+    }
+    /* one thread: rotate directly; otherwise pass c and s through alpha[] to rot_thread_function */
+    if (nthreads == 1) {
+        rot_compute(n, x, inc_x, y, inc_y, c, s);
+    }
+    else {
+#if defined(DOUBLE)
+	    int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
+#else
+	    int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
+#endif
+	    blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
+    }
+#else	
+    rot_compute(n, x, inc_x, y, inc_y, c, s);
+#endif
+    return 0;
+}
diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c
new file mode 100644
index 000000000..cba962042
--- /dev/null
+++ b/kernel/x86_64/srot_microk_haswell-2.c
@@ -0,0 +1,87 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+/* 256-bit (__m256) version with fused multiply-add: rotates x[0..n) and y[0..n) in place by (c, s). */
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    /* n rounded down to multiples of 8 and 32: the end points of the two vector loops */
+    BLASLONG tail_index_8 = n&(~7);
+    BLASLONG tail_index_32 = n&(~31);
+
+    __m256 c_256, s_256;
+    if (n >= 8) {
+        c_256 = _mm256_set1_ps(c);
+        s_256 = _mm256_set1_ps(s);
+    }
+    /* c_256 and s_256 stay unset for n < 8, where neither vector loop runs */
+    __m256 x0, x1, x2, x3;
+    __m256 y0, y1, y2, y3;
+    __m256 t0, t1, t2, t3;
+    /* main loop: 32 elements (four 8-wide vectors) per pass, using unaligned loads and stores */
+    for (i = 0; i < tail_index_32; i += 32) {
+        x0 = _mm256_loadu_ps(&x[i + 0]);
+        x1 = _mm256_loadu_ps(&x[i + 8]);
+        x2 = _mm256_loadu_ps(&x[i +16]);
+        x3 = _mm256_loadu_ps(&x[i +24]);
+        y0 = _mm256_loadu_ps(&y[i + 0]);
+        y1 = _mm256_loadu_ps(&y[i + 8]);
+        y2 = _mm256_loadu_ps(&y[i +16]);
+        y3 = _mm256_loadu_ps(&y[i +24]);
+        /* new x = c*x + s*y */
+        t0 = _mm256_mul_ps(s_256, y0);
+        t1 = _mm256_mul_ps(s_256, y1);
+        t2 = _mm256_mul_ps(s_256, y2);
+        t3 = _mm256_mul_ps(s_256, y3);
+
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);
+        t1 = _mm256_fmadd_ps(c_256, x1, t1);
+        t2 = _mm256_fmadd_ps(c_256, x2, t2);
+        t3 = _mm256_fmadd_ps(c_256, x3, t3);
+
+        _mm256_storeu_ps(&x[i + 0], t0);
+        _mm256_storeu_ps(&x[i + 8], t1);
+        _mm256_storeu_ps(&x[i +16], t2);
+        _mm256_storeu_ps(&x[i +24], t3);
+        /* new y = c*y - s*x, computed from the x vectors loaded before the stores above */
+        t0 = _mm256_mul_ps(s_256, x0);
+        t1 = _mm256_mul_ps(s_256, x1);
+        t2 = _mm256_mul_ps(s_256, x2);
+        t3 = _mm256_mul_ps(s_256, x3);
+
+        t0 = _mm256_fmsub_ps(c_256, y0, t0);
+        t1 = _mm256_fmsub_ps(c_256, y1, t1);
+        t2 = _mm256_fmsub_ps(c_256, y2, t2);
+        t3 = _mm256_fmsub_ps(c_256, y3, t3);
+
+        _mm256_storeu_ps(&y[i + 0], t0);
+        _mm256_storeu_ps(&y[i + 8], t1);
+        _mm256_storeu_ps(&y[i +16], t2);
+        _mm256_storeu_ps(&y[i +24], t3);
+
+    }
+    /* one 8-wide vector per pass for what is left below tail_index_8 */
+    for (i = tail_index_32; i < tail_index_8; i += 8) {
+        x0 = _mm256_loadu_ps(&x[i]);
+        y0 = _mm256_loadu_ps(&y[i]);
+
+        t0 = _mm256_mul_ps(s_256, y0);
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);  /* multiplicand is the loaded x vector, as in the main loop */
+        _mm256_storeu_ps(&x[i], t0);
+
+        t0 = _mm256_mul_ps(s_256, x0);
+        t0 = _mm256_fmsub_ps(c_256, y0, t0);
+        _mm256_storeu_ps(&y[i], t0);
+    }
+    /* scalar tail: the last n & 7 elements */
+    for (i = tail_index_8; i < n; ++i) {
+        FLOAT temp = c * x[i] + s * y[i];
+        y[i] = c * y[i] - s * x[i];
+        x[i] = temp;
+    }
+}
+#endif
diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c
new file mode 100644
index 000000000..a21d1cf64
--- /dev/null
+++ b/kernel/x86_64/srot_microk_skylakex-2.c
@@ -0,0 +1,91 @@
+/* need a new enough compiler for avx512 support: GCC 7 or later with __AVX512CD__ defined, or clang 9 or later */
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+/* Rotate n contiguous elements in place: x[i] <- c*x[i] + s*y[i], y[i] <- c*y[i] - s*x[i] (the same arithmetic is used in all three sections below). */
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    __m512 c_512, s_512;
+    c_512 = _mm512_set1_ps(c);
+    s_512 = _mm512_set1_ps(s);
+    /* n rounded down to a multiple of 16 (one 512-bit vector) and to a multiple of 64 (four vectors) */
+    BLASLONG tail_index_16 = n&(~15);
+    BLASLONG tail_index_64 = n&(~63);
+
+    /* four independent x / y / temporary registers for the 4x unrolled main loop */
+    __m512 x0, x1, x2, x3;
+    __m512 y0, y1, y2, y3;
+    __m512 t0, t1, t2, t3;
+    /* main loop: 64 elements per iteration */
+    for (i = 0; i < tail_index_64; i += 64) {
+        x0 = _mm512_loadu_ps(&x[i + 0]);
+        x1 = _mm512_loadu_ps(&x[i +16]);
+        x2 = _mm512_loadu_ps(&x[i +32]);
+        x3 = _mm512_loadu_ps(&x[i +48]);
+        y0 = _mm512_loadu_ps(&y[i + 0]);
+        y1 = _mm512_loadu_ps(&y[i +16]);
+        y2 = _mm512_loadu_ps(&y[i +32]);
+        y3 = _mm512_loadu_ps(&y[i +48]);
+        /* t = s*y, then t = c*x + t : the new x values */
+        t0 = _mm512_mul_ps(s_512, y0);
+        t1 = _mm512_mul_ps(s_512, y1);
+        t2 = _mm512_mul_ps(s_512, y2);
+        t3 = _mm512_mul_ps(s_512, y3);
+
+        t0 = _mm512_fmadd_ps(c_512, x0, t0);
+        t1 = _mm512_fmadd_ps(c_512, x1, t1);
+        t2 = _mm512_fmadd_ps(c_512, x2, t2);
+        t3 = _mm512_fmadd_ps(c_512, x3, t3);
+        /* x is overwritten here; the old values stay available in x0..x3 for the y update */
+        _mm512_storeu_ps(&x[i + 0], t0);
+        _mm512_storeu_ps(&x[i +16], t1);
+        _mm512_storeu_ps(&x[i +32], t2);
+        _mm512_storeu_ps(&x[i +48], t3);
+        /* t = s*x(old), then t = c*y - t : the new y values */
+        t0 = _mm512_mul_ps(s_512, x0);
+        t1 = _mm512_mul_ps(s_512, x1);
+        t2 = _mm512_mul_ps(s_512, x2);
+        t3 = _mm512_mul_ps(s_512, x3);
+
+        t0 = _mm512_fmsub_ps(c_512, y0, t0);
+        t1 = _mm512_fmsub_ps(c_512, y1, t1);
+        t2 = _mm512_fmsub_ps(c_512, y2, t2);
+        t3 = _mm512_fmsub_ps(c_512, y3, t3);
+
+        _mm512_storeu_ps(&y[i + 0], t0);
+        _mm512_storeu_ps(&y[i +16], t1);
+        _mm512_storeu_ps(&y[i +32], t2);
+        _mm512_storeu_ps(&y[i +48], t3);
+    }
+    /* remaining full vectors: 16 elements per iteration */
+    for (i = tail_index_64; i < tail_index_16; i += 16) {
+        x0 = _mm512_loadu_ps(&x[i]);
+        y0 = _mm512_loadu_ps(&y[i]);
+
+        t0 = _mm512_mul_ps(s_512, y0);
+        t0 = _mm512_fmadd_ps(c_512, x0, t0);
+        _mm512_storeu_ps(&x[i], t0);
+
+        t0 = _mm512_mul_ps(s_512, x0);
+        t0 = _mm512_fmsub_ps(c_512, y0, t0);
+        _mm512_storeu_ps(&y[i], t0);
+    }
+    /* final 1..15 elements: the mask has its low (n & 15) bits set, one bit per lane */
+    /* masked-off lanes are loaded as zero (maskz load) and are not written back (masked store) */
+    if ((n & 15) > 0) {
+        uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15)));
+        __m512 tail_x = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &x[tail_index_16]);
+        __m512 tail_y = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &y[tail_index_16]);
+	    __m512 temp = _mm512_mul_ps(s_512, tail_y);
+	    temp = _mm512_fmadd_ps(c_512, tail_x, temp);
+	    _mm512_mask_storeu_ps(&x[tail_index_16], *((__mmask16*)&tail_mask16), temp);
+	    temp = _mm512_mul_ps(s_512, tail_x);
+	    temp = _mm512_fmsub_ps(c_512, tail_y, temp);
+	    _mm512_mask_storeu_ps(&y[tail_index_16], *((__mmask16*)&tail_mask16), temp);	
+    }
+}
+#endif

From 725ffbf041b021d2f3602b2313e4027aab19ee89 Mon Sep 17 00:00:00 2001
From: Gengxin Xie <gengxin.xie@intel.com>
Date: Thu, 5 Nov 2020 16:25:17 +0800
Subject: [PATCH 72/83] fix typo

---
 kernel/x86_64/srot_microk_haswell-2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c
index cba962042..8e245cc8f 100644
--- a/kernel/x86_64/srot_microk_haswell-2.c
+++ b/kernel/x86_64/srot_microk_haswell-2.c
@@ -70,7 +70,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
         y0 = _mm256_loadu_ps(&y[i]);
 
         t0 = _mm256_mul_ps(s_256, y0);
-        t0 = _mm256_fmadd_ps(c_256, s0, t0);
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);
         _mm256_storeu_ps(&x[i], t0);
 
         t0 = _mm256_mul_ps(s_256, x0);

From 28d2dfe2b3bd6c779137fcb53451f97f47b78b37 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Nov 2020 12:17:49 +0100
Subject: [PATCH 73/83] Fix macro name used in ifdef

---
 kernel/arm/zdot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c
index ba0e57eb5..73ae3acd7 100644
--- a/kernel/arm/zdot.c
+++ b/kernel/arm/zdot.c
@@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 		i++ ;
 
 	}
-#if !defined(__POWER__)	
+#if !defined(__PPC__)	
         CREAL(result) = dot[0];
 	CIMAG(result) = dot[1];
 #else

From 438a8e5624ef1adfe98f989655ca398866143458 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Nov 2020 20:26:12 +0100
Subject: [PATCH 74/83] Fix placement of getarch call and spurious cpu property
 accumulation in DYNAMIC_ARCH builds

---
 cmake/prebuild.cmake |  45 ++++++----------
 cmake/system.cmake   | 124 ++++++++++++++++++++-----------------------
 2 files changed, 73 insertions(+), 96 deletions(-)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index b1b4c501a..da7686c33 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -139,36 +139,6 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
       set(CGEMM3M_UNROLL_N 4)
       set(ZGEMM3M_UNROLL_M 4)
       set(ZGEMM3M_UNROLL_N 4)
-  elseif ("${TCORE}" STREQUAL "BARCELONA")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_SSE3\n")
-  elseif ("${TCORE}" STREQUAL "STEAMROLLER")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_SSE3\n")
-  elseif ("${TCORE}" STREQUAL "EXCAVATOR")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_SSE3\n")
-  elseif ("${TCORE}" STREQUAL "NEHALEM")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_SSE3\n")
-  elseif ("${TCORE}" STREQUAL "PRESCOTT")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_SSE3\n")
-  elseif ("${TCORE}" STREQUAL "SANDYBRIDGE")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_AVX\n")
-  elseif ("${TCORE}" STREQUAL "HASWELL")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_AVX2\n")
-  elseif ("${TCORE}" STREQUAL "ZEN")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_AVX2\n")
-  elseif ("${TCORE}" STREQUAL "SKYLAKEX")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_AVX512\n")
-  elseif ("${TCORE}" STREQUAL "COOPERLAKE")
-    file(APPEND ${TARGET_CONF_TEMP}
-      "#define HAVE_AVX512\n")
   elseif ("${TCORE}" STREQUAL "ARMV7")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE\t65536\n"
@@ -586,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING)
       MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
     endif ()
   endif ()
+  unset (HAVE_AVX2)   # the HAVE_* cpu property variables are cleared right before getarch is run,
+  unset (HAVE_AVX)    # so that values left over from an earlier pass do not accumulate (DYNAMIC_ARCH builds);
+  unset (HAVE_FMA3)   # presumably the getarch output then defines the ones that apply - TODO confirm
+  unset (HAVE_MMX)
+  unset (HAVE_SSE)
+  unset (HAVE_SSE2)
+  unset (HAVE_SSE3)
+  unset (HAVE_SSSE3)
+  unset (HAVE_SSE4A)
+  unset (HAVE_SSE4_1)
+  unset (HAVE_SSE4_2)
+  unset (HAVE_NEON)
+  unset (HAVE_VFP)
+  unset (HAVE_VFPV3)
+  unset (HAVE_VFPV4)
   message(STATUS "Running getarch")
 
   # use the cmake binary w/ the -E param to run a shell command in a cross-platform way
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 48d206b12..66e95c6d3 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -44,74 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
   endif ()
 endif ()
 
-if (DEFINED TARGET)
-  if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512)
-#    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-        if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
-          set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
-        else()
-          set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
-        endif()
-#    elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
-#      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
-#    endif()    
-  endif()
-  if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
-    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
-  endif()
-  if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
-    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
-      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-      if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
-        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
-      endif()
-    elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2")
-    endif()
-  endif()
-  if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
-  endif()
-  if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2)
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
-  endif()
-  if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX)
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx")
-  endif()
-  if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
-  endif()
-  if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
-  endif()
-  if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
-  endif()
-  if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
-  endif()
-  if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON")
-      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
-  endif()
-  if (DEFINED HAVE_SSE)
-    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
-  endif()
-  if (DEFINED HAVE_SSE2)
-    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2")
-  endif()
-  if (DEFINED HAVE_SSE3)
-    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
-  endif()
-    if (DEFINED HAVE_SSSE3)
-    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3")
-  endif()
-    if (DEFINED HAVE_SSE4_1)
-    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
-  endif()
-endif()
 
 if (DEFINED TARGET)
+  message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --")
   message(STATUS "Targeting the ${TARGET} architecture.")
   set(GETARCH_FLAGS "-DFORCE_${TARGET}")
 endif ()
@@ -211,6 +146,63 @@ else()
 endif ()
 
 include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
+# Translate the requested TARGET and the HAVE_* cpu properties into compiler options
+# appended to KERNEL_DEFINITIONS. This runs after the prebuild.cmake include above;
+# presumably that include is what populates the HAVE_* variables via getarch (confirm).
+if (DEFINED TARGET)
+  if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
+    # The version probe is run for whatever C compiler is in use, not only for GCC.
+    # NOTE(review): a compiler whose -dumpversion prints only the major number ("10")
+    # compares below 10.1 and gets the skylake-avx512 fallback - confirm this is intended.
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
+    else()
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
+    endif()
+  endif()
+  if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
+  endif()
+  if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
+      # -mavx2 is only passed to GCC 4.7 or newer
+      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+      if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
+        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+      endif()
+    # CMake reports clang as "Clang" / "AppleClang"; STREQUAL is case-sensitive, so "CLANG" never matched.
+    elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+    endif()
+  endif()
+  # Flags derived from individual cpu properties; NO_AVX / NO_AVX2 veto the AVX family.
+  if (DEFINED HAVE_AVX AND NOT NO_AVX)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx")
+  endif()
+  if (DEFINED HAVE_AVX2 AND NOT NO_AVX2)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+  endif()
+  # -mfma is gated by NO_AVX2 as well
+  if (DEFINED HAVE_FMA3 AND NOT NO_AVX2)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
+  endif()
+  if (DEFINED HAVE_SSE)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
+  endif()
+  if (DEFINED HAVE_SSE2)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2")
+  endif()
+  if (DEFINED HAVE_SSE3)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+  endif()
+  if (DEFINED HAVE_SSSE3)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3")
+  endif()
+  if (DEFINED HAVE_SSE4_1)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
+  endif()
+endif()
 if (DEFINED BINARY)
   message(STATUS "Compiling a ${BINARY}-bit binary.")
 endif ()

From a29338aaa6b364ce99ea30785d1227bd327ce3c7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Nov 2020 20:27:42 +0100
Subject: [PATCH 75/83] Remove extraneous quotes that caused a cmake policy
 warning

---
 cmake/cc.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 2f4d1c6d7..b963940d6 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
   endif ()
 endif ()
 
-if (${CORE} STREQUAL "SKYLAKEX")
+if (${CORE} STREQUAL SKYLAKEX)
   if (NOT DYNAMIC_ARCH)
     if (NOT NO_AVX512)
       set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
@@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX")
   endif ()
 endif ()
 
-if (${CORE} STREQUAL "COOPERLAKE")
+if (${CORE} STREQUAL COOPERLAKE)
   if (NOT DYNAMIC_ARCH)
     if (NOT NO_AVX512)
       execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)

From ccb9731c7b41b601412b00b73f6da98613d66b7f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Nov 2020 20:30:15 +0100
Subject: [PATCH 76/83] Fix propagation of cpu properties to compiler options

---
 Makefile.x86_64 | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 49a9a0a23..43bfc9ecd 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -9,9 +9,9 @@ endif
 endif
 
 ifdef HAVE_SSE3
-ifndef DYNAMIC_ARCH
 CCOMMON_OPT += -msse3
 FCOMMON_OPT += -msse3
+endif
 ifdef HAVE_SSSE3
 CCOMMON_OPT += -mssse3
 FCOMMON_OPT += -mssse3
@@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1
 CCOMMON_OPT += -msse4.1
 FCOMMON_OPT += -msse4.1
 endif
+ifdef HAVE_AVX
+CCOMMON_OPT += -mavx
+FCOMMON_OPT += -mavx
 endif
+ifdef HAVE_AVX2
+CCOMMON_OPT += -mavx2
+FCOMMON_OPT += -mavx2
+endif
+ifdef HAVE_FMA3
+CCOMMON_OPT += -mfma
+FCOMMON_OPT += -mfma
 endif
 
 ifeq ($(CORE), SKYLAKEX)
@@ -66,8 +76,7 @@ endif
 endif
 endif
 
-ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE))
-ifndef DYNAMIC_ARCH
+ifdef HAVE_AVX2
 ifndef NO_AVX2
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
@@ -96,7 +105,6 @@ endif
 endif
 endif
 endif
-endif
 
 
 

From a04f532edfe65a7e4cf4dfb2dc34d363e2eba065 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Nov 2020 20:37:03 +0100
Subject: [PATCH 77/83] Reset cpu property flags between build cycles in
 DYNAMIC_ARCH mode

---
 Makefile.system | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index ca302a98a..dc7ed3f3a 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -252,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
 ifndef TARGET_CORE
 include $(TOPDIR)/Makefile.conf
 else
+undefine HAVE_NEON
+undefine HAVE_VFP
+undefine HAVE_VFPV3
+undefine HAVE_VFPV4
+undefine HAVE_MMX
+undefine HAVE_SSE
+undefine HAVE_SSE2
+undefine HAVE_SSE3
+undefine HAVE_SSSE3
+undefine HAVE_SSE4_1
+undefine HAVE_SSE4_2
+undefine HAVE_SSE4A
+undefine HAVE_SSE5
+undefine HAVE_AVX
+undefine HAVE_AVX2
+undefine HAVE_FMA3
 include $(TOPDIR)/Makefile_kernel.conf
 endif
 
@@ -1522,6 +1538,8 @@ export HAVE_SSE4_2
 export HAVE_SSE4A
 export HAVE_SSE5
 export HAVE_AVX
+export HAVE_AVX2
+export HAVE_FMA3
 export HAVE_VFP
 export HAVE_VFPV3
 export HAVE_VFPV4

From b976a0bf4095fd8b9e80ae3cf0e0f6eab200219e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Nov 2020 20:39:56 +0100
Subject: [PATCH 78/83] Remove previous workaround for compiler flags related
 to cpu capabilities in x86_64 DYNAMIC_ARCH builds

---
 kernel/Makefile | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index e811ed43d..fb1d5d39a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,13 +5,6 @@ endif
 TOPDIR	= ..
 include $(TOPDIR)/Makefile.system
 
-ifdef HAVE_SSE3
-CFLAGS += -msse3
-endif
-ifdef HAVE_SSSE3
-CFLAGS += -mssse3
-endif
-
 ifeq ($(ARCH), power)
 ifeq ($(C_COMPILER), CLANG)
  override CFLAGS += -fno-integrated-as
@@ -38,12 +31,6 @@ ifdef NO_AVX2
 endif
 
 ifdef TARGET_CORE
-	ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3))
-	override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1
-endif
-	ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON))
-	override CFLAGS += -msse -msse2
-endif
 ifeq ($(TARGET_CORE), COOPERLAKE)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
  ifeq ($(GCCVERSIONGTEQ10), 1) 

From 6e364981a8af0f72ad9e62a69fe62fdedc18255b Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Sat, 7 Nov 2020 15:21:58 -0600
Subject: [PATCH 79/83] Optimize sdot/ddot for POWER10

This patch makes use of new POWER10 vector pair instructions for
loads and stores.
---
 kernel/power/KERNEL.POWER10        |   6 +-
 kernel/power/ddot_microk_power10.c | 131 ++++++++++++++++++++++++
 kernel/power/ddot_power10.c        | 130 ++++++++++++++++++++++++
 kernel/power/sdot_microk_power10.c | 135 +++++++++++++++++++++++++
 kernel/power/sdot_power10.c        | 154 +++++++++++++++++++++++++++++
 5 files changed, 553 insertions(+), 3 deletions(-)
 create mode 100644 kernel/power/ddot_microk_power10.c
 create mode 100644 kernel/power/ddot_power10.c
 create mode 100644 kernel/power/sdot_microk_power10.c
 create mode 100644 kernel/power/sdot_power10.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 28c39051f..c25cd9f04 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -151,9 +151,9 @@ DCOPYKERNEL  = dcopy_power10.c
 CCOPYKERNEL  = ccopy_power10.c
 ZCOPYKERNEL  = zcopy_power10.c
 #
-SDOTKERNEL   =  sdot.c
-DDOTKERNEL   =  ddot.c
-DSDOTKERNEL  =  sdot.c
+SDOTKERNEL   =  sdot_power10.c
+DDOTKERNEL   =  ddot_power10.c
+DSDOTKERNEL  =  sdot_power10.c
 ifneq ($(GCCVERSIONGTEQ9),1)
 CDOTKERNEL   =  cdot_power9.S
 else
diff --git a/kernel/power/ddot_microk_power10.c b/kernel/power/ddot_microk_power10.c
new file mode 100644
index 000000000..3a9865cc0
--- /dev/null
+++ b/kernel/power/ddot_microk_power10.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+/* Dot product of x[0..n-1] and y[0..n-1] in VSX inline assembly, 16 doubles (128 bytes per vector) per loop pass. NOTE(review): looks like n must be a positive multiple of 16 - confirm against the caller. */
+static double ddot_kernel_8 (long n, double *x, double *y)
+{
+  double dot;
+
+  __asm__
+    (
+       "dcbt		0, %2		\n\t"
+       "dcbt		0, %3		\n\t"
+       /* clear the eight accumulators vs32..vs39 */
+       "xxlxor		32, 32,	32	\n\t"
+       "xxlxor		33, 33,	33	\n\t"
+       "xxlxor		34, 34,	34	\n\t"
+       "xxlxor		35, 35,	35	\n\t"
+       "xxlxor		36, 36,	36	\n\t"
+       "xxlxor		37, 37,	37	\n\t"
+       "xxlxor		38, 38,	38	\n\t"
+       "xxlxor		39, 39,	39	\n\t"
+       /* preload the first 128 bytes of x into vs40..vs47 and of y into vs48..vs55 (lxvp fills a register pair) */
+       "lxvp            40, 0(%2)       \n\t"
+       "lxvp            42, 32(%2)      \n\t"
+       "lxvp            44, 64(%2)      \n\t"
+       "lxvp            46, 96(%2)      \n\t"
+       "lxvp            48, 0(%3)       \n\t"
+       "lxvp            50, 32(%3)      \n\t"
+       "lxvp            52, 64(%3)      \n\t"
+       "lxvp            54, 96(%3)      \n\t"
+       /* step both pointers past the block that was just loaded */
+       "addi		%2, %2, 128	\n\t"
+       "addi		%3, %3, 128	\n\t"
+       /* n -= 16; if nothing is left, go straight to the final accumulation at label two */
+       "addic.		%1, %1, -16	\n\t"
+       "ble		two%=		\n\t"
+
+       ".align	5		\n"
+     "one%=:				\n\t"
+       /* multiply-accumulate the block held in registers while reloading them for the next pass */
+       "xvmaddadp	32, 40, 48	\n\t"
+       "xvmaddadp	33, 41, 49	\n\t"
+       "lxvp            40, 0(%2)       \n\t"
+       "lxvp            48, 0(%3)       \n\t"
+       "xvmaddadp	34, 42, 50	\n\t"
+       "xvmaddadp	35, 43, 51	\n\t"
+       "lxvp            42, 32(%2)      \n\t"
+       "lxvp            50, 32(%3)      \n\t"
+       "xvmaddadp	36, 44, 52	\n\t"
+       "xvmaddadp	37, 45, 53	\n\t"
+       "lxvp            44, 64(%2)      \n\t"
+       "lxvp            52, 64(%3)      \n\t"
+       "xvmaddadp	38, 46, 54	\n\t"
+       "xvmaddadp	39, 47, 55	\n\t"
+       "lxvp            46, 96(%2)      \n\t"
+       "lxvp            54, 96(%3)      \n\t"
+
+       "addi		%2, %2, 128	\n\t"
+       "addi		%3, %3, 128	\n\t"
+
+       "addic.		%1, %1, -16	\n\t"
+       "bgt		one%=		\n"
+
+     "two%=:				\n\t"
+       /* accumulate the last block, which is still held in registers */
+       "xvmaddadp	32, 40, 48	\n\t"
+       "xvmaddadp	33, 41, 49	\n\t"
+       "xvmaddadp	34, 42, 50	\n\t"
+       "xvmaddadp	35, 43, 51	\n\t"
+       "xvmaddadp	36, 44, 52	\n\t"
+       "xvmaddadp	37, 45, 53	\n\t"
+       "xvmaddadp	38, 46, 54	\n\t"
+       "xvmaddadp	39, 47, 55	\n\t"
+       /* reduce the eight vector accumulators pairwise into vs32 */
+       "xvadddp		32, 32, 33	\n\t"
+       "xvadddp		34, 34, 35	\n\t"
+       "xvadddp		36, 36, 37	\n\t"
+       "xvadddp		38, 38, 39	\n\t"
+
+       "xvadddp		32, 32, 34	\n\t"
+       "xvadddp		36, 36, 38	\n\t"
+
+       "xvadddp		32, 32, 36	\n\t"
+       /* add the two halves of vs32 (XXSWAPD_S is defined outside this file; presumably a doubleword swap - confirm) */
+       XXSWAPD_S(33,32)
+
+       "xsadddp		%x0, 32, 33	\n"
+
+     "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n"
+     :
+       "=d" (dot),	// 0
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y)		// 3
+     :
+       "m" (*x),
+       "m" (*y)
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55"
+     );
+
+  return dot;
+}
diff --git a/kernel/power/ddot_power10.c b/kernel/power/ddot_power10.c
new file mode 100644
index 000000000..302dceb68
--- /dev/null
+++ b/kernel/power/ddot_power10.c
@@ -0,0 +1,130 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "ddot_microk_power10.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+/* Plain C fallback, compiled only when the microkernel include above did not define HAVE_KERNEL_8. Sums x[i]*y[i] eight elements per pass; it reads x[i..i+7] unconditionally, so n is expected to be a multiple of 8. */
+static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y)
+{
+	BLASLONG register i = 0;
+	FLOAT dot = 0.0;
+
+	while(i < n)
+        {
+              dot += y[i]  * x[i]
+                  + y[i+1] * x[i+1]
+                  + y[i+2] * x[i+2]
+                  + y[i+3] * x[i+3]
+                  + y[i+4] * x[i+4]
+                  + y[i+5] * x[i+5]
+                  + y[i+6] * x[i+6]
+                  + y[i+7] * x[i+7] ;
+
+              i+=8 ;
+
+       }
+       return dot;
+}
+
+#endif
+/* Dot product of n elements of x (stride inc_x) and y (stride inc_y); returns 0.0 when n <= 0. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	FLOAT  dot = 0.0 ;
+
+	if ( n <= 0 )  return(dot);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+		/* unit stride: hand the largest multiple of 16 elements to the kernel */
+		BLASLONG n1 = n & -16;
+
+		if ( n1 )
+			dot = ddot_kernel_8(n1, x, y);
+		/* the remaining (n - n1) elements are accumulated one at a time */
+		i = n1;
+		while(i < n)
+		{
+
+			dot += y[i] * x[i] ;
+			i++ ;
+
+		}
+		return(dot);
+
+
+	}
+	/* strided case: two partial sums, four elements per pass */
+	FLOAT temp1 = 0.0;
+	FLOAT temp2 = 0.0;
+
+        BLASLONG n1 = n & -4;	
+
+	while(i < n1)
+	{
+
+		FLOAT m1 = y[iy]       * x[ix] ;
+		FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ;
+
+		FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ;
+		FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ;
+
+		ix  += inc_x*4 ;
+		iy  += inc_y*4 ;
+		/* even-position products go to temp1, odd-position products to temp2 */
+		temp1 += m1+m3;
+		temp2 += m2+m4;
+
+		i+=4 ;
+
+	}
+	/* leftover elements (fewer than four) */
+	while(i < n)
+	{
+
+		temp1 += y[iy] * x[ix] ;
+		ix  += inc_x ;
+		iy  += inc_y ;
+		i++ ;
+
+	}
+	dot = temp1 + temp2;
+	return(dot);
+
+}
+
+
diff --git a/kernel/power/sdot_microk_power10.c b/kernel/power/sdot_microk_power10.c
new file mode 100644
index 000000000..2f028c5a0
--- /dev/null
+++ b/kernel/power/sdot_microk_power10.c
@@ -0,0 +1,135 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_16 1
+/* Dot product of x[0..n-1] and y[0..n-1] in VSX inline assembly, 32 floats (128 bytes per vector) per loop pass. NOTE(review): looks like n must be a positive multiple of 32 - confirm against the caller. */
+static float sdot_kernel_16 (long n, float *x, float *y)
+{
+  float dot;
+
+  __asm__
+    (
+       "dcbt		0, %2		\n\t"
+       "dcbt		0, %3		\n\t"
+       /* clear the eight accumulators vs32..vs39 */
+       "xxlxor		32, 32,	32	\n\t"
+       "xxlxor		33, 33,	33	\n\t"
+       "xxlxor		34, 34,	34	\n\t"
+       "xxlxor		35, 35,	35	\n\t"
+       "xxlxor		36, 36,	36	\n\t"
+       "xxlxor		37, 37,	37	\n\t"
+       "xxlxor		38, 38,	38	\n\t"
+       "xxlxor		39, 39,	39	\n\t"
+       /* preload the first 128 bytes of x into vs40..vs47 and of y into vs48..vs55 (lxvp fills a register pair) */
+       "lxvp		40, 0(%2)	\n\t"
+       "lxvp		42, 32(%2)	\n\t"
+       "lxvp		44, 64(%2)	\n\t"
+       "lxvp		46, 96(%2)	\n\t"
+       "lxvp		48, 0(%3)	\n\t"
+       "lxvp		50, 32(%3)	\n\t"
+       "lxvp		52, 64(%3)	\n\t"
+       "lxvp		54, 96(%3)	\n\t"
+       /* step both pointers past the block that was just loaded */
+       "addi		%2, %2, 128	\n\t"
+       "addi		%3, %3, 128	\n\t"
+       /* n -= 32; if nothing is left, go straight to the final accumulation at label two */
+       "addic.		%1, %1, -32	\n\t"
+       "ble		two%=		\n\t"
+
+       ".align	5		\n"
+     "one%=:				\n\t"
+       /* multiply-accumulate the block held in registers while reloading them for the next pass */
+       "xvmaddasp	32, 40, 48	\n\t"
+       "xvmaddasp	33, 41, 49	\n\t"
+       "lxvp		40, 0(%2)	\n\t"
+       "lxvp		48, 0(%3)	\n\t"
+       "xvmaddasp	34, 42, 50	\n\t"
+       "xvmaddasp	35, 43, 51	\n\t"
+       "lxvp		42, 32(%2)	\n\t"
+       "lxvp		50, 32(%3)	\n\t"
+       "xvmaddasp	36, 44, 52	\n\t"
+       "xvmaddasp	37, 45, 53	\n\t"
+       "lxvp		44, 64(%2)	\n\t"
+       "lxvp		52, 64(%3)	\n\t"
+       "xvmaddasp	38, 46, 54 	\n\t"
+       "xvmaddasp	39, 47, 55 	\n\t"
+       "lxvp		46, 96(%2)	\n\t"
+       "lxvp		54, 96(%3)	\n\t"
+
+       "addi		%2, %2, 128	\n\t"
+       "addi		%3, %3, 128	\n\t"
+
+       "addic.		%1, %1, -32	\n\t"
+       "bgt		one%=		\n"
+
+     "two%=:				\n\t"
+       /* accumulate the last block, which is still held in registers */
+       "xvmaddasp	32, 40, 48	\n\t"
+       "xvmaddasp	33, 41, 49	\n\t"
+       "xvmaddasp	34, 42, 50	\n\t"
+       "xvmaddasp	35, 43, 51	\n\t"
+       "xvmaddasp	36, 44, 52	\n\t"
+       "xvmaddasp	37, 45, 53	\n\t"
+       "xvmaddasp	38, 46, 54	\n\t"
+       "xvmaddasp	39, 47, 55	\n\t"
+       /* reduce the eight vector accumulators pairwise into vs32 */
+       "xvaddsp		32, 32, 33	\n\t"
+       "xvaddsp		34, 34, 35	\n\t"
+       "xvaddsp		36, 36, 37	\n\t"
+       "xvaddsp		38, 38, 39	\n\t"
+
+       "xvaddsp		32, 32, 34	\n\t"
+       "xvaddsp		36, 36, 38	\n\t"
+
+       "xvaddsp		32, 32, 36	\n\t"
+       /* fold the four word lanes of vs32 together: shift by two words and add, then by one word and add */
+       "xxsldwi		33, 32, 32, 2	\n\t"
+       "xvaddsp		32, 32, 33	\n\t"
+
+       "xxsldwi		33, 32, 32, 1	\n\t"
+       "xvaddsp		32, 32, 33	\n\t"
+       /* convert the single-precision sum into the scalar register format used for the "=f" output */
+       "xscvspdp	%x0, 32		\n"
+
+     "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n"
+     :
+       "=f" (dot),	// 0
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y)		// 3
+     :
+       "m" (*x),
+       "m" (*y)
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55"
+     );
+
+  return dot;
+}
diff --git a/kernel/power/sdot_power10.c b/kernel/power/sdot_power10.c
new file mode 100644
index 000000000..b61f0a90d
--- /dev/null
+++ b/kernel/power/sdot_power10.c
@@ -0,0 +1,154 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "sdot_microk_power10.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+/* Plain-C fallback for sdot_kernel_16, compiled only when HAVE_KERNEL_16 is not defined at this point (presumably the micro-kernel include above defines it -- TODO confirm). Returns the sum of x[i]*y[i] for i in [0, n). */
+static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+	BLASLONG register i = 0;
+	FLOAT dot = 0.0;
+
+	while(i < n)
+        {
+              dot += y[i]  * x[i]
+                  + y[i+1] * x[i+1]
+                  + y[i+2] * x[i+2]
+                  + y[i+3] * x[i+3]
+                  + y[i+4] * x[i+4]
+                  + y[i+5] * x[i+5]
+                  + y[i+6] * x[i+6]
+                  + y[i+7] * x[i+7] ;
+              /* eight elements are read per pass, so n has to be a multiple of 8 to stay within the first n entries */
+              i+=8 ;
+
+       }
+       return dot;
+}
+
+#endif
+/* Dot product of x and y over n elements with strides inc_x / inc_y. With DSDOT defined the function returns double and widens every product or kernel partial sum to double; otherwise it returns FLOAT (the running total is still kept in the double variable dot and converted on return). */
+#if defined (DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	double dot = 0.0 ;
+
+#if defined (DSDOT)
+        double mydot = 0.0;
+        FLOAT asmdot = 0.0;
+#else
+	FLOAT mydot=0.0;
+#endif
+	BLASLONG n1;
+	/* nothing to sum for n <= 0: dot is still 0.0 here */
+	if ( n <= 0 )  return(dot);
+	/* contiguous case: the bulk goes through sdot_kernel_16, the remaining tail through a scalar loop */
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+		/* n1 is n rounded down to a multiple of 32 (the mask clears the low five bits) */
+	        n1 = n & (BLASLONG)(-32);
+		/* DSDOT: call the kernel on 32-element chunks and add each partial sum as a double; otherwise a single kernel call covers all n1 elements */
+		if ( n1 )
+#if defined(DSDOT)
+			{
+			FLOAT *x1=x;
+			FLOAT *y1=y;
+			BLASLONG n2 = 32;
+			while (i<n1) {
+				asmdot = sdot_kernel_16(n2, x1, y1);
+				mydot += (double)asmdot;
+				asmdot=0.;
+				x1+=32;
+				y1+=32;
+				i+=32;
+			}
+		}
+#else		
+			mydot = sdot_kernel_16(n1, x, y);
+#endif
+		i = n1;
+		while(i < n)
+		{
+#if defined(DSDOT)
+			dot += (double)y[i] * (double)x[i] ;
+#else
+			dot += y[i] * x[i] ;
+#endif
+			i++ ;
+
+		}
+		/* combine the kernel result with the scalar tail */
+		dot+=mydot;
+		return(dot);
+
+
+	}
+	/* strided case: two elements per pass over the even part of n, then at most one leftover element */
+	n1 = n & (BLASLONG)(-2);
+
+	while(i < n1)
+	{
+#if defined (DSDOT)
+		dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
+#else
+		dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
+#endif
+		ix  += inc_x*2 ;
+		iy  += inc_y*2 ;
+		i+=2 ;
+
+	}
+
+	while(i < n)
+	{
+#if defined (DSDOT)
+		dot += (double)y[iy] * (double)x[ix] ;
+#else
+		dot += y[iy] * x[ix] ;
+#endif
+		ix  += inc_x ;
+		iy  += inc_y ;
+		i++ ;
+
+	}
+	return(dot);
+
+}
+
+

From f4b7ba12b71f97b6e5f8cec462635b9334c62a72 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Nov 2020 23:37:21 +0100
Subject: [PATCH 80/83] Update Makefile.system

---
 Makefile.system | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index dc7ed3f3a..258a84262 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -252,7 +252,9 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
 ifndef TARGET_CORE
 include $(TOPDIR)/Makefile.conf
 else
+ifdef HAVE_NEON
 undefine HAVE_NEON
+endif
 undefine HAVE_VFP
 undefine HAVE_VFPV3
 undefine HAVE_VFPV4

From f6a57d8f63ed0f1fa4823d27daafc2cb3a6dc96b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Nov 2020 00:01:36 +0100
Subject: [PATCH 81/83] Update Makefile.system

---
 Makefile.system | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 258a84262..da2d452b2 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -255,9 +255,15 @@ else
 ifdef HAVE_NEON
 undefine HAVE_NEON
 endif
+ifdef HAVE_VFP
 undefine HAVE_VFP
+endif
+ifdef HAVE_VFPV3
 undefine HAVE_VFPV3
+endif
+ifdef HAVE_VFPV4
 undefine HAVE_VFPV4
+endif
 undefine HAVE_MMX
 undefine HAVE_SSE
 undefine HAVE_SSE2

From 1c4cfdc13937765dd9bd0ef8b846ba027ec086b3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Nov 2020 00:12:55 +0100
Subject: [PATCH 82/83] Stay compatible with old gmake that did not support
 undefine

---
 Makefile.system | 42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index da2d452b2..aae7ba503 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -6,7 +6,7 @@
 INCLUDED = 1
 
 ifndef TOPDIR
-TOPDIR = .
+TOPDIR = .
 endif
 
  # If ARCH is not set, we use the host system's architecture for getarch compile options.
@@ -252,30 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
 ifndef TARGET_CORE
 include $(TOPDIR)/Makefile.conf
 else
-ifdef HAVE_NEON
-undefine HAVE_NEON
-endif
-ifdef HAVE_VFP
-undefine HAVE_VFP
-endif
-ifdef HAVE_VFPV3
-undefine HAVE_VFPV3
-endif
-ifdef HAVE_VFPV4
-undefine HAVE_VFPV4
-endif
-undefine HAVE_MMX
-undefine HAVE_SSE
-undefine HAVE_SSE2
-undefine HAVE_SSE3
-undefine HAVE_SSSE3
-undefine HAVE_SSE4_1
-undefine HAVE_SSE4_2
-undefine HAVE_SSE4A
-undefine HAVE_SSE5
-undefine HAVE_AVX
-undefine HAVE_AVX2
-undefine HAVE_FMA3
+HAVE_NEON=
+HAVE_VFP=
+HAVE_VFPV3=
+HAVE_VFPV4=
+HAVE_MMX=
+HAVE_SSE=
+HAVE_SSE2=
+HAVE_SSE3=
+HAVE_SSSE3=
+HAVE_SSE4_1=
+HAVE_SSE4_2=
+HAVE_SSE4A=
+HAVE_SSE5=
+HAVE_AVX=
+HAVE_AVX2=
+HAVE_FMA3=
 include $(TOPDIR)/Makefile_kernel.conf
 endif
 

From ec088bf33aa3034a82b713ea304fe30e36c278ec Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Nov 2020 13:15:40 +0100
Subject: [PATCH 83/83] Fix missing AVX2 and FMA3 capabilities in FORCE_target
 mode

---
 getarch.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/getarch.c b/getarch.c
index ab90f36d9..daf669e56 100644
--- a/getarch.c
+++ b/getarch.c
@@ -330,7 +330,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
-                     "-DFMA3"
+                     "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
 #define LIBNAME   "haswell"
 #define CORENAME  "HASWELL"
 #endif
@@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
-                     "-DFMA3"
+                     "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
 #define LIBNAME   "haswell"
 #define CORENAME  "HASWELL"
 #else
@@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
-                     "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
+                     "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
 #define LIBNAME   "skylakex"
 #define CORENAME  "SKYLAKEX"
 #endif
@@ -376,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
                      "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
                      "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
                      "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
-                     "-DFMA3"
+                     "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
 #define LIBNAME   "haswell"
 #define CORENAME  "HASWELL"
 #else
@@ -389,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
                      "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
                      "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
                      "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
-                     "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
+                     "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
 #define LIBNAME   "cooperlake"
 #define CORENAME  "COOPERLAKE"
 #endif
@@ -559,7 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-		     "-DHAVE_AVX -DHAVE_FMA3 -DFMA3"
+		     "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
 #define LIBNAME   "zen"
 #define CORENAME  "ZEN"
 #endif