From 961d25e9c7e4a1758adb1dbeaa15187de69dd052 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 23 May 2018 22:54:39 +0200
Subject: [PATCH 01/86] Use the new zrot.c on POWER8 for crot as well

fixes #1571 (the old zrot.S assembly does not handle incx=0 correctly)
---
 kernel/power/KERNEL.POWER8 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 00ff8682a..1aa061078 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -133,7 +133,7 @@ ZNRM2KERNEL  = ../arm/znrm2.c
 #
 SROTKERNEL   = srot.c
 DROTKERNEL   = drot.c
-#CROTKERNEL   = ../arm/zrot.c
+CROTKERNEL   = zrot.c
 ZROTKERNEL   = zrot.c
 #
 SSCALKERNEL  = sscal.c

From 43e592ceb38a56716279a6514ceca1ec9bdb0865 Mon Sep 17 00:00:00 2001
From: Zhang Xianyi <xianyi@perfxlab.com>
Date: Thu, 24 May 2018 20:56:24 +0800
Subject: [PATCH 02/86] Add -lm for Android.

Conflicts:
	exports/Makefile
---
 exports/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/exports/Makefile b/exports/Makefile
index 53d4f75bb..127b05057 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -128,6 +128,8 @@ so : ../$(LIBSONAME)
 
 ifeq ($(OSNAME), Android)
 INTERNALNAME = $(LIBPREFIX).so
+FEXTRALIB += -lm
+EXTRALIB += -lm
 else
 INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif

From 908d40be715bfb252972a0a4abf27726a729945f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 29 May 2018 14:27:46 +0200
Subject: [PATCH 03/86] Adapt lapack-test and blas-test to changes in netlib
 directory layout

partial fix for #1574 - the problem with lapack_testing.py looks like an upstream bug
---
 Makefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index c0e5fbcf8..380ba1ce8 100644
--- a/Makefile
+++ b/Makefile
@@ -294,9 +294,10 @@ endif
 
 lapack-test :
 	(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
-	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc  xeigtstd  xeigtsts  xeigtstz  xlintstc  xlintstd  xlintstds  xlintstrfd  xlintstrfz  xlintsts  xlintstz  xlintstzc xlintstrfs xlintstrfc
+	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc  xeigtstd  xeigtsts  xeigtstz 
+	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc  xlintstd  xlintstds  xlintstrfd  xlintstrfz  xlintsts  xlintstz  xlintstzc xlintstrfs xlintstrfc
 ifneq ($(CROSS), 1)
-	( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
+	( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
         ./testsecond; ./testdsecnd; ./testieee; ./testversion )
 	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
 endif
@@ -308,9 +309,9 @@ lapack-runtest:
 
 
 blas-test:
-	(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
+	(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
 	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
-	(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
+	(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
 
 
 dummy :

From a7dbd4c57d22b580b32f3a97b0b327bf2fedf551 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 31 May 2018 11:19:33 +0200
Subject: [PATCH 04/86] Fix paths to LIN and EIG tests

should fix #1574
---
 lapack-netlib/lapack_testing.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py
index 3c917482d..5d07e1e87 100755
--- a/lapack-netlib/lapack_testing.py
+++ b/lapack-netlib/lapack_testing.py
@@ -257,16 +257,16 @@ for dtype in range_prec:
         else:
             if dtest==16:
                 # LIN TESTS
-                cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             elif dtest==17:
                 # PROTO LIN TESTS
-                cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             elif dtest==18:
                 # PROTO LIN TESTS
-                cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             else:
                 # EIG TESTS
-                cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
         if (not just_errors and not short_summary):
             print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ')
         # Run the process: either to read the file or run the LAPACK testing

From 5fae96fb70cbc1205e50220f77722ac5ff92f0d8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 31 May 2018 12:43:45 +0200
Subject: [PATCH 05/86] Update version to 0.3.1.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b5789119a..f49f20513 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 0.dev)
+set(OpenBLAS_PATCH_VERSION 1.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions

From b491b10057196c5735a261608ec110b1bbd134d1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 31 May 2018 12:44:36 +0200
Subject: [PATCH 06/86] Update version to 0.3.1.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index 12734464b..1b4b8eb63 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.0.dev
+VERSION = 0.3.1.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From d1b7be14aa9b57ca4df9c00cdb4611974729b3be Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 31 May 2018 12:52:04 +0200
Subject: [PATCH 07/86] Handle INCX=0,INCY=0 case

Fixes #1575 (sswap/dswap failing the swap utest on x86) as suggested by atsampson.
---
 kernel/x86/swap.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S
index 54b00b33e..d3cf04942 100644
--- a/kernel/x86/swap.S
+++ b/kernel/x86/swap.S
@@ -138,6 +138,14 @@
 /* INCX != 1 or INCY != 1 */
 
 .L14:
+	cmpl	$0, %ebx
+	jne	.L141
+	cmpl	$0, %ecx
+	jne	.L141
+/* INCX == 0 and INCY == 0 */	
+	jmp	.L27
+
+.L141	
 	movl	%edx, %eax
 	sarl	$2,   %eax
 	jle	.L28

From a91f1587b9be6c9bbc403a79970d3e2a03bf866c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 31 May 2018 13:26:00 +0200
Subject: [PATCH 08/86] Work around name clash with Windows10's winnt.h

fixes #1503
---
 driver/level3/Makefile | 48 +++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/driver/level3/Makefile b/driver/level3/Makefile
index 352225206..e320092e3 100644
--- a/driver/level3/Makefile
+++ b/driver/level3/Makefile
@@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c  ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 cgemm_cr.$(SUFFIX) : gemm.c level3.c  ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 cgemm_cc.$(SUFFIX) : gemm.c level3.c  ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
-	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c  ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 cgemm_cr.$(PSUFFIX) : gemm.c level3.c  ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 cgemm_cc.$(PSUFFIX) : gemm.c level3.c  ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
-	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
 
 xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
-	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
+	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
 
 xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)

From 2fc748bf7200ca53d66d43107dc2c732685519d0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 31 May 2018 13:41:12 +0200
Subject: [PATCH 09/86] Restore optimized swap kernel now that we have a proper
 fix

---
 kernel/x86/KERNEL.NEHALEM | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/x86/KERNEL.NEHALEM b/kernel/x86/KERNEL.NEHALEM
index 835520efb..65b03ae50 100644
--- a/kernel/x86/KERNEL.NEHALEM
+++ b/kernel/x86/KERNEL.NEHALEM
@@ -1,3 +1 @@
 include $(KERNELDIR)/KERNEL.PENRYN
-SSWAPKERNEL  = ../arm/swap.c
-DSWAPKERNEL  = ../arm/swap.c

From 7df8c4f76fa7aadd8d1bce1d99fe826a4826d775 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 31 May 2018 17:23:08 +0200
Subject: [PATCH 10/86] typo fix: add missing colon after the .L141 label

---
 kernel/x86/swap.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S
index d3cf04942..e30c27898 100644
--- a/kernel/x86/swap.S
+++ b/kernel/x86/swap.S
@@ -145,7 +145,7 @@
 /* INCX == 0 and INCY == 0 */	
 	jmp	.L27
 
-.L141	
+.L141:	
 	movl	%edx, %eax
 	sarl	$2,   %eax
 	jle	.L28

From e2a8c35e5a6897e5aebf5e2fb8ba18f94735c89a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 1 Jun 2018 15:08:14 +0200
Subject: [PATCH 11/86] Fixes from netlib PR253

LAPACKE interfaces for Aasen's functions now call ?sytrf_aa and ?hetrf_aa instead of ?sytrf and ?hetrf
---
 lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c | 6 +++---
 lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c | 6 +++---
 lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c | 6 +++---
 lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c | 6 +++---
 lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c | 6 +++---
 lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c | 6 +++---
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c
index b4a7595d8..e4d538779 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c
@@ -41,7 +41,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n,
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_chetrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
+        LAPACK_chetrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -56,7 +56,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_chetrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
+            LAPACK_chetrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -69,7 +69,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         /* Transpose input matrices */
         LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_chetrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
+        LAPACK_chetrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c
index d4f24142b..f6661c85c 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c
@@ -41,7 +41,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_csytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
+        LAPACK_csytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -56,7 +56,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_csytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
+            LAPACK_csytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -69,7 +69,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         /* Transpose input matrices */
         LAPACKE_csy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_csytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
+        LAPACK_csytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c
index cbf97b632..e72bfa6de 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c
@@ -40,7 +40,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_dsytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
+        LAPACK_dsytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -55,7 +55,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_dsytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
+            LAPACK_dsytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -67,7 +67,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         /* Transpose input matrices */
         LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_dsytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
+        LAPACK_dsytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c
index d68cb17c1..182946a45 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c
@@ -40,7 +40,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_ssytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
+        LAPACK_ssytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -55,7 +55,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_ssytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
+            LAPACK_ssytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -67,7 +67,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         /* Transpose input matrices */
         LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_ssytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
+        LAPACK_ssytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c
index 5214217fb..dbad2d81e 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c
@@ -41,7 +41,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n,
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_zhetrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
+        LAPACK_zhetrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -56,7 +56,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_zhetrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
+            LAPACK_zhetrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -69,7 +69,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         /* Transpose input matrices */
         LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_zhetrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
+        LAPACK_zhetrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c
index 29d75319e..03726c63e 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c
@@ -41,7 +41,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_zsytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
+        LAPACK_zsytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -56,7 +56,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_zsytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
+            LAPACK_zsytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -69,7 +69,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n,
         /* Transpose input matrices */
         LAPACKE_zsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_zsytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
+        LAPACK_zsytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }

From 677e42d7b0c6b6c40af94268fbb9d9be60f7af0a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 1 Jun 2018 15:12:59 +0200
Subject: [PATCH 12/86] Fixes from netlib PR 253

No error is reported any more when only the minimal workspace is passed to ?hesv_aa, ?sysv_aa, ?hesv_aa_2stage or ?sysv_aa_2stage.
Add a quick return to ?laqr1 when N is neither 2 nor 3.
---
 lapack-netlib/SRC/cgejsv.f           |  4 ++--
 lapack-netlib/SRC/chesv_aa.f         |  5 ++---
 lapack-netlib/SRC/chesv_aa_2stage.f  | 15 +++++++++------
 lapack-netlib/SRC/chetrf_aa_2stage.f |  6 +++++-
 lapack-netlib/SRC/chetrs_aa_2stage.f |  1 +
 lapack-netlib/SRC/cla_syamv.f        |  2 +-
 lapack-netlib/SRC/claqr1.f           |  7 +++++++
 lapack-netlib/SRC/csysv_aa.f         |  3 ---
 lapack-netlib/SRC/csysv_aa_2stage.f  | 15 +++++++++------
 lapack-netlib/SRC/csytrf_aa_2stage.f |  6 +++++-
 lapack-netlib/SRC/csytri2.f          |  6 +++---
 lapack-netlib/SRC/csytrs_aa_2stage.f |  1 +
 lapack-netlib/SRC/ctrevc3.f          | 18 +++++++++---------
 lapack-netlib/SRC/dgelqt.f           |  2 +-
 lapack-netlib/SRC/dla_syamv.f        |  2 +-
 lapack-netlib/SRC/dlaqr1.f           |  7 +++++++
 lapack-netlib/SRC/dsysv_aa.f         |  3 ---
 lapack-netlib/SRC/dsysv_aa_2stage.f  | 13 +++++++------
 lapack-netlib/SRC/dsytrf_aa_2stage.f |  8 ++++++--
 lapack-netlib/SRC/dsytri2.f          |  6 +++---
 lapack-netlib/SRC/dsytrs_aa_2stage.f |  1 +
 lapack-netlib/SRC/dtrevc3.f          |  4 ++--
 lapack-netlib/SRC/iparmq.f           |  4 ++--
 lapack-netlib/SRC/sla_syamv.f        |  2 +-
 lapack-netlib/SRC/slaqr1.f           |  7 +++++++
 lapack-netlib/SRC/ssysv_aa.f         |  3 ---
 lapack-netlib/SRC/ssysv_aa_2stage.f  | 13 +++++++------
 lapack-netlib/SRC/ssytrf_aa_2stage.f |  6 +++++-
 lapack-netlib/SRC/ssytri2.f          |  4 ++--
 lapack-netlib/SRC/ssytrs_aa_2stage.f |  1 +
 lapack-netlib/SRC/strevc3.f          | 12 ++++++------
 lapack-netlib/SRC/zgejsv.f           |  4 ++--
 lapack-netlib/SRC/zhesv_aa.f         |  5 ++---
 lapack-netlib/SRC/zhesv_aa_2stage.f  | 13 +++++++------
 lapack-netlib/SRC/zhetrf_aa_2stage.f |  6 +++++-
 lapack-netlib/SRC/zhetrs_aa_2stage.f |  7 ++++---
 lapack-netlib/SRC/zla_syamv.f        |  2 +-
 lapack-netlib/SRC/zlaqr1.f           |  7 +++++++
 lapack-netlib/SRC/zsysv_aa.f         |  3 ---
 lapack-netlib/SRC/zsysv_aa_2stage.f  | 13 +++++++------
 lapack-netlib/SRC/zsytrf_aa_2stage.f |  6 +++++-
 lapack-netlib/SRC/zsytri2.f          |  2 +-
 lapack-netlib/SRC/zsytrs_aa_2stage.f |  1 +
 43 files changed, 155 insertions(+), 101 deletions(-)

diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f
index 8eb43cf50..a7b1c451c 100644
--- a/lapack-netlib/SRC/cgejsv.f
+++ b/lapack-netlib/SRC/cgejsv.f
@@ -701,7 +701,7 @@
           LWSVDJ  = MAX( 2 * N, 1 )         
           LWSVDJV = MAX( 2 * N, 1 )
 *         .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ
-          LRWQP3  = N 
+          LRWQP3  = 2 * N 
           LRWCON  = N 
           LRWSVDJ = N 
           IF ( LQUERY ) THEN 
@@ -939,7 +939,7 @@
              END IF 
           END IF
           MINWRK = MAX( 2, MINWRK )
-          OPTWRK = MAX( 2, OPTWRK )
+          OPTWRK = MAX( OPTWRK, MINWRK )
           IF ( LWORK  .LT. MINWRK  .AND. (.NOT.LQUERY) ) INFO = - 17
           IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19   
       END IF
diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f
index 0bf636b48..470f910bc 100644
--- a/lapack-netlib/SRC/chesv_aa.f
+++ b/lapack-netlib/SRC/chesv_aa.f
@@ -209,6 +209,8 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
+      ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+         INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
@@ -219,9 +221,6 @@
          LWKOPT_HETRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS )
          WORK( 1 ) = LWKOPT
-         IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
-            INFO = -10
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f
index 057d9c57a..05f6b7bb7 100644
--- a/lapack-netlib/SRC/chesv_aa_2stage.f
+++ b/lapack-netlib/SRC/chesv_aa_2stage.f
@@ -105,6 +105,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -124,7 +125,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -150,6 +151,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -233,19 +235,18 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
+      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+         INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
+      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
          LWKOPT = INT( WORK(1) )
-         IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN
-            INFO = -7
-         ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN
-            INFO = -13
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -270,6 +271,8 @@
       END IF
 *
       WORK( 1 ) = LWKOPT
+*
+      RETURN
 *
 *     End of CHESV_AA_2STAGE
 *
diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f
index 0fa2ae3a0..ce34d73cc 100644
--- a/lapack-netlib/SRC/chetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/chetrf_aa_2stage.f
@@ -93,6 +93,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -112,7 +113,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -125,6 +126,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -658,6 +660,8 @@ c     $                     (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 )
 *
 *     Factor the band matrix
       CALL CGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO )
+*
+      RETURN
 *
 *     End of CHETRF_AA_2STAGE
 *
diff --git a/lapack-netlib/SRC/chetrs_aa_2stage.f b/lapack-netlib/SRC/chetrs_aa_2stage.f
index 3f8576673..05d09275b 100644
--- a/lapack-netlib/SRC/chetrs_aa_2stage.f
+++ b/lapack-netlib/SRC/chetrs_aa_2stage.f
@@ -87,6 +87,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/cla_syamv.f b/lapack-netlib/SRC/cla_syamv.f
index e1d3df960..695b5e478 100644
--- a/lapack-netlib/SRC/cla_syamv.f
+++ b/lapack-netlib/SRC/cla_syamv.f
@@ -241,7 +241,7 @@
          INFO = 10
       END IF
       IF( INFO.NE.0 )THEN
-         CALL XERBLA( 'SSYMV ', INFO )
+         CALL XERBLA( 'CLA_SYAMV', INFO )
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/claqr1.f b/lapack-netlib/SRC/claqr1.f
index b76bedf60..977947196 100644
--- a/lapack-netlib/SRC/claqr1.f
+++ b/lapack-netlib/SRC/claqr1.f
@@ -142,6 +142,13 @@
       CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) )
 *     ..
 *     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.NE.2 .AND. N.NE.3 ) THEN
+         RETURN
+      END IF
+*
       IF( N.EQ.2 ) THEN
          S = CABS1( H( 1, 1 )-S2 ) + CABS1( H( 2, 1 ) )
          IF( S.EQ.RZERO ) THEN
diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f
index 9cd669d33..87be734cc 100644
--- a/lapack-netlib/SRC/csysv_aa.f
+++ b/lapack-netlib/SRC/csysv_aa.f
@@ -221,9 +221,6 @@
          LWKOPT_SYTRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
          WORK( 1 ) = LWKOPT
-         IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
-            INFO = -10
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f
index cba57fc3e..a13349824 100644
--- a/lapack-netlib/SRC/csysv_aa_2stage.f
+++ b/lapack-netlib/SRC/csysv_aa_2stage.f
@@ -105,6 +105,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -124,7 +125,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -150,6 +151,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -233,19 +235,18 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
+      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+         INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
+      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
          LWKOPT = INT( WORK(1) )
-         IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN
-            INFO = -7
-         ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN
-            INFO = -13
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -270,6 +271,8 @@
       END IF
 *
       WORK( 1 ) = LWKOPT
+*
+      RETURN
 *
 *     End of CSYSV_AA_2STAGE
 *
diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f
index 0a6bfbe31..0d0bd156c 100644
--- a/lapack-netlib/SRC/csytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/csytrf_aa_2stage.f
@@ -93,6 +93,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -112,7 +113,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -125,6 +126,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -662,6 +664,8 @@ c     $                     (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 )
 *
 *     Factor the band matrix
       CALL CGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO )
+*
+      RETURN
 *
 *     End of CSYTRF_AA_2STAGE
 *
diff --git a/lapack-netlib/SRC/csytri2.f b/lapack-netlib/SRC/csytri2.f
index 4c6baaa3e..4bd8e4f99 100644
--- a/lapack-netlib/SRC/csytri2.f
+++ b/lapack-netlib/SRC/csytri2.f
@@ -96,11 +96,11 @@
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
 *>          WORK is size >= (N+NB+1)*(NB+3)
-*>          If LDWORK = -1, then a workspace query is assumed; the routine
+*>          If LWORK = -1, then a workspace query is assumed; the routine
 *>           calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
-*>              - and no error message related to LDWORK is issued by XERBLA.
+*>              - and no error message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -163,7 +163,7 @@
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
 *     Get blocksize
-      NBMAX = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 )
+      NBMAX = ILAENV( 1, 'CSYTRI2', UPLO, N, -1, -1, -1 )
       IF ( NBMAX .GE. N ) THEN
          MINSIZE = N
       ELSE
diff --git a/lapack-netlib/SRC/csytrs_aa_2stage.f b/lapack-netlib/SRC/csytrs_aa_2stage.f
index 03bccda82..d025c08fe 100644
--- a/lapack-netlib/SRC/csytrs_aa_2stage.f
+++ b/lapack-netlib/SRC/csytrs_aa_2stage.f
@@ -85,6 +85,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f
index c06b40477..a134c1a50 100644
--- a/lapack-netlib/SRC/ctrevc3.f
+++ b/lapack-netlib/SRC/ctrevc3.f
@@ -27,8 +27,8 @@
 *       ..
 *       .. Array Arguments ..
 *       LOGICAL            SELECT( * )
-*       REAL   RWORK( * )
-*       COMPLEX         T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ),
+*       REAL               RWORK( * )
+*       COMPLEX            T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ),
 *      $                   WORK( * )
 *       ..
 *
@@ -258,17 +258,17 @@
 *     ..
 *     .. Array Arguments ..
       LOGICAL            SELECT( * )
-      REAL   RWORK( * )
-      COMPLEX         T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ),
+      REAL               RWORK( * )
+      COMPLEX            T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ),
      $                   WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
-      REAL   ZERO, ONE
+      REAL               ZERO, ONE
       PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
-      COMPLEX         CZERO, CONE
+      COMPLEX            CZERO, CONE
       PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
      $                     CONE  = ( 1.0E+0, 0.0E+0 ) )
       INTEGER            NBMIN, NBMAX
@@ -277,13 +277,13 @@
 *     .. Local Scalars ..
       LOGICAL            ALLV, BOTHV, LEFTV, LQUERY, OVER, RIGHTV, SOMEV
       INTEGER            I, II, IS, J, K, KI, IV, MAXWRK, NB
-      REAL   OVFL, REMAX, SCALE, SMIN, SMLNUM, ULP, UNFL
-      COMPLEX         CDUM
+      REAL               OVFL, REMAX, SCALE, SMIN, SMLNUM, ULP, UNFL
+      COMPLEX            CDUM
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV, ICAMAX
-      REAL   SLAMCH, SCASUM
+      REAL               SLAMCH, SCASUM
       EXTERNAL           LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM
 *     ..
 *     .. External Subroutines ..
diff --git a/lapack-netlib/SRC/dgelqt.f b/lapack-netlib/SRC/dgelqt.f
index 2124f3dc3..5b4ee65b5 100644
--- a/lapack-netlib/SRC/dgelqt.f
+++ b/lapack-netlib/SRC/dgelqt.f
@@ -158,7 +158,7 @@
       INTEGER    I, IB, IINFO, K
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL   DGEQRT2, DGELQT3, DGEQRT3, DLARFB, XERBLA
+      EXTERNAL   DGELQT3, DLARFB, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
diff --git a/lapack-netlib/SRC/dla_syamv.f b/lapack-netlib/SRC/dla_syamv.f
index 29566a6e9..bb6dbe288 100644
--- a/lapack-netlib/SRC/dla_syamv.f
+++ b/lapack-netlib/SRC/dla_syamv.f
@@ -230,7 +230,7 @@
          INFO = 10
       END IF
       IF( INFO.NE.0 )THEN
-         CALL XERBLA( 'DSYMV ', INFO )
+         CALL XERBLA( 'DLA_SYAMV', INFO )
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/dlaqr1.f b/lapack-netlib/SRC/dlaqr1.f
index 81a462fb3..795b072ab 100644
--- a/lapack-netlib/SRC/dlaqr1.f
+++ b/lapack-netlib/SRC/dlaqr1.f
@@ -147,6 +147,13 @@
       INTRINSIC          ABS
 *     ..
 *     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.NE.2 .AND. N.NE.3 ) THEN
+         RETURN
+      END IF
+*
       IF( N.EQ.2 ) THEN
          S = ABS( H( 1, 1 )-SR2 ) + ABS( SI2 ) + ABS( H( 2, 1 ) )
          IF( S.EQ.ZERO ) THEN
diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f
index cbccd5e65..7192928c6 100644
--- a/lapack-netlib/SRC/dsysv_aa.f
+++ b/lapack-netlib/SRC/dsysv_aa.f
@@ -221,9 +221,6 @@
          LWKOPT_SYTRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
          WORK( 1 ) = LWKOPT
-         IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
-            INFO = -10
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f
index ac3c77d76..05e538f0b 100644
--- a/lapack-netlib/SRC/dsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/dsysv_aa_2stage.f
@@ -107,6 +107,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -126,7 +127,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -152,6 +153,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -235,19 +237,18 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
+      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+         INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
+      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
          LWKOPT = INT( WORK(1) )
-         IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN
-            INFO = -7
-         ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN
-            INFO = -13
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f
index f5f06cc1d..25fc1a2eb 100644
--- a/lapack-netlib/SRC/dsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f
@@ -93,6 +93,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -109,6 +110,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -128,10 +130,10 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
-*>          row and column IPIV(k).
+*>          row and column IPIV2(k).
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -641,6 +643,8 @@ c     $                     (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 )
 *
 *     Factor the band matrix
       CALL DGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO )
+*
+      RETURN
 *
 *     End of DSYTRF_AA_2STAGE
 *
diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f
index 9aa21a854..23f8b9fa2 100644
--- a/lapack-netlib/SRC/dsytri2.f
+++ b/lapack-netlib/SRC/dsytri2.f
@@ -96,11 +96,11 @@
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
 *>          WORK is size >= (N+NB+1)*(NB+3)
-*>          If LDWORK = -1, then a workspace query is assumed; the routine
+*>          If LWORK = -1, then a workspace query is assumed; the routine
 *>           calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
-*>              - and no error message related to LDWORK is issued by XERBLA.
+*>              - and no error message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -163,7 +163,7 @@
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
 *     Get blocksize
-      NBMAX = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 )
+      NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 )
       IF ( NBMAX .GE. N ) THEN
          MINSIZE = N
       ELSE
diff --git a/lapack-netlib/SRC/dsytrs_aa_2stage.f b/lapack-netlib/SRC/dsytrs_aa_2stage.f
index caff5d4ad..bb283cb95 100644
--- a/lapack-netlib/SRC/dsytrs_aa_2stage.f
+++ b/lapack-netlib/SRC/dsytrs_aa_2stage.f
@@ -85,6 +85,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f
index 745f636d0..957baf4f0 100644
--- a/lapack-netlib/SRC/dtrevc3.f
+++ b/lapack-netlib/SRC/dtrevc3.f
@@ -45,9 +45,9 @@
 *> The right eigenvector x and the left eigenvector y of T corresponding
 *> to an eigenvalue w are defined by:
 *>
-*>    T*x = w*x,     (y**H)*T = w*(y**H)
+*>    T*x = w*x,     (y**T)*T = w*(y**T)
 *>
-*> where y**H denotes the conjugate transpose of y.
+*> where y**T denotes the transpose of the vector y.
 *> The eigenvalues are not input to this routine, but are read directly
 *> from the diagonal blocks of T.
 *>
diff --git a/lapack-netlib/SRC/iparmq.f b/lapack-netlib/SRC/iparmq.f
index e576e0db0..a9212b3e0 100644
--- a/lapack-netlib/SRC/iparmq.f
+++ b/lapack-netlib/SRC/iparmq.f
@@ -104,13 +104,13 @@
 *>
 *> \param[in] NAME
 *> \verbatim
-*>          NAME is character string
+*>          NAME is CHARACTER string
 *>               Name of the calling subroutine
 *> \endverbatim
 *>
 *> \param[in] OPTS
 *> \verbatim
-*>          OPTS is character string
+*>          OPTS is CHARACTER string
 *>               This is a concatenation of the string arguments to
 *>               TTQRE.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sla_syamv.f b/lapack-netlib/SRC/sla_syamv.f
index d40e7bd95..4459f4d8b 100644
--- a/lapack-netlib/SRC/sla_syamv.f
+++ b/lapack-netlib/SRC/sla_syamv.f
@@ -230,7 +230,7 @@
          INFO = 10
       END IF
       IF( INFO.NE.0 )THEN
-         CALL XERBLA( 'SSYMV ', INFO )
+         CALL XERBLA( 'SLA_SYAMV', INFO )
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/slaqr1.f b/lapack-netlib/SRC/slaqr1.f
index 7d7d851ee..2de33849d 100644
--- a/lapack-netlib/SRC/slaqr1.f
+++ b/lapack-netlib/SRC/slaqr1.f
@@ -147,6 +147,13 @@
       INTRINSIC          ABS
 *     ..
 *     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.NE.2 .AND. N.NE.3 ) THEN
+         RETURN
+      END IF
+*
       IF( N.EQ.2 ) THEN
          S = ABS( H( 1, 1 )-SR2 ) + ABS( SI2 ) + ABS( H( 2, 1 ) )
          IF( S.EQ.ZERO ) THEN
diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f
index abf52b143..e470f5883 100644
--- a/lapack-netlib/SRC/ssysv_aa.f
+++ b/lapack-netlib/SRC/ssysv_aa.f
@@ -220,9 +220,6 @@
          LWKOPT_SYTRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
          WORK( 1 ) = LWKOPT
-         IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
-            INFO = -10
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f
index a738c7415..43d937141 100644
--- a/lapack-netlib/SRC/ssysv_aa_2stage.f
+++ b/lapack-netlib/SRC/ssysv_aa_2stage.f
@@ -106,6 +106,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -125,7 +126,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -151,6 +152,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -234,19 +236,18 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
+      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+         INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
+      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
          LWKOPT = INT( WORK(1) )
-         IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN
-            INFO = -7
-         ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN
-            INFO = -13
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f
index a92974930..0e0f6edb7 100644
--- a/lapack-netlib/SRC/ssytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f
@@ -93,6 +93,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -112,7 +113,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -125,6 +126,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -641,6 +643,8 @@ c     $                     (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 )
 *
 *     Factor the band matrix
       CALL SGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO )
+*
+      RETURN
 *
 *     End of SSYTRF_AA_2STAGE
 *
diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f
index 97b539005..4b9ea4e7b 100644
--- a/lapack-netlib/SRC/ssytri2.f
+++ b/lapack-netlib/SRC/ssytri2.f
@@ -96,11 +96,11 @@
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
 *>          WORK is size >= (N+NB+1)*(NB+3)
-*>          If LDWORK = -1, then a workspace query is assumed; the routine
+*>          If LWORK = -1, then a workspace query is assumed; the routine
 *>           calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
-*>              - and no error message related to LDWORK is issued by XERBLA.
+*>              - and no error message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
diff --git a/lapack-netlib/SRC/ssytrs_aa_2stage.f b/lapack-netlib/SRC/ssytrs_aa_2stage.f
index c9c7181f2..d271b9481 100644
--- a/lapack-netlib/SRC/ssytrs_aa_2stage.f
+++ b/lapack-netlib/SRC/ssytrs_aa_2stage.f
@@ -85,6 +85,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f
index 0df1189f0..525978071 100644
--- a/lapack-netlib/SRC/strevc3.f
+++ b/lapack-netlib/SRC/strevc3.f
@@ -27,7 +27,7 @@
 *       ..
 *       .. Array Arguments ..
 *       LOGICAL            SELECT( * )
-*       REAL   T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ),
+*       REAL               T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ),
 *      $                   WORK( * )
 *       ..
 *
@@ -45,9 +45,9 @@
 *> The right eigenvector x and the left eigenvector y of T corresponding
 *> to an eigenvalue w are defined by:
 *>
-*>    T*x = w*x,     (y**H)*T = w*(y**H)
+*>    T*x = w*x,     (y**T)*T = w*(y**T)
 *>
-*> where y**H denotes the conjugate transpose of y.
+*> where y**T denotes the transpose of the vector y.
 *> The eigenvalues are not input to this routine, but are read directly
 *> from the diagonal blocks of T.
 *>
@@ -251,14 +251,14 @@
 *     ..
 *     .. Array Arguments ..
       LOGICAL            SELECT( * )
-      REAL   T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ),
+      REAL               T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ),
      $                   WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
-      REAL   ZERO, ONE
+      REAL               ZERO, ONE
       PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
       INTEGER            NBMIN, NBMAX
       PARAMETER          ( NBMIN = 8, NBMAX = 128 )
@@ -268,7 +268,7 @@
      $                   RIGHTV, SOMEV
       INTEGER            I, IERR, II, IP, IS, J, J1, J2, JNXT, K, KI,
      $                   IV, MAXWRK, NB, KI2
-      REAL   BETA, BIGNUM, EMAX, OVFL, REC, REMAX, SCALE,
+      REAL               BETA, BIGNUM, EMAX, OVFL, REC, REMAX, SCALE,
      $                   SMIN, SMLNUM, ULP, UNFL, VCRIT, VMAX, WI, WR,
      $                   XNORM
 *     ..
diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f
index e8418c680..d553da90b 100644
--- a/lapack-netlib/SRC/zgejsv.f
+++ b/lapack-netlib/SRC/zgejsv.f
@@ -704,7 +704,7 @@
           LWSVDJ  = MAX( 2 * N, 1 )         
           LWSVDJV = MAX( 2 * N, 1 )
 *         .. minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ
-          LRWQP3  = N 
+          LRWQP3  = 2 * N 
           LRWCON  = N 
           LRWSVDJ = N 
           IF ( LQUERY ) THEN 
@@ -942,7 +942,7 @@
              END IF 
           END IF
           MINWRK = MAX( 2, MINWRK )
-          OPTWRK = MAX( 2, OPTWRK )
+          OPTWRK = MAX( MINWRK, OPTWRK )
           IF ( LWORK  .LT. MINWRK  .AND. (.NOT.LQUERY) ) INFO = - 17
           IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19   
       END IF
diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f
index bbd0fdff4..8511f0e7d 100644
--- a/lapack-netlib/SRC/zhesv_aa.f
+++ b/lapack-netlib/SRC/zhesv_aa.f
@@ -209,6 +209,8 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
+      ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN
+         INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
@@ -219,9 +221,6 @@
          LWKOPT_HETRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS )
          WORK( 1 ) = LWKOPT
-         IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
-             INFO = -10
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f
index a34440029..ed221dc69 100644
--- a/lapack-netlib/SRC/zhesv_aa_2stage.f
+++ b/lapack-netlib/SRC/zhesv_aa_2stage.f
@@ -106,6 +106,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -125,7 +126,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -151,6 +152,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -240,19 +242,18 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
+      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+         INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
+      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
          LWKOPT = INT( WORK(1) )
-         IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN
-            INFO = -7
-         ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN
-            INFO = -13
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f
index 4d62198d6..73c0ebe9a 100644
--- a/lapack-netlib/SRC/zhetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f
@@ -93,6 +93,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -112,7 +113,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -125,6 +126,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -657,6 +659,8 @@ c     $                     (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 )
 *
 *     Factor the band matrix
       CALL ZGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO )
+*
+      RETURN
 *
 *     End of ZHETRF_AA_2STAGE
 *
diff --git a/lapack-netlib/SRC/zhetrs_aa_2stage.f b/lapack-netlib/SRC/zhetrs_aa_2stage.f
index 02e17476f..7fcee1118 100644
--- a/lapack-netlib/SRC/zhetrs_aa_2stage.f
+++ b/lapack-netlib/SRC/zhetrs_aa_2stage.f
@@ -69,7 +69,7 @@
 *>
 *> \param[in] A
 *> \verbatim
-*>          A is COMPLEX*16array, dimension (LDA,N)
+*>          A is COMPLEX*16 array, dimension (LDA,N)
 *>          Details of factors computed by ZHETRF_AA_2STAGE.
 *> \endverbatim
 *>
@@ -81,12 +81,13 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX*16array, dimension (LTB)
+*>          TB is COMPLEX*16 array, dimension (LTB)
 *>          Details of factors computed by ZHETRF_AA_2STAGE.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N.
 *> \endverbatim
 *>
@@ -106,7 +107,7 @@
 *>
 *> \param[in,out] B
 *> \verbatim
-*>          B is COMPLEX*16array, dimension (LDB,NRHS)
+*>          B is COMPLEX*16 array, dimension (LDB,NRHS)
 *>          On entry, the right hand side matrix B.
 *>          On exit, the solution matrix X.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zla_syamv.f b/lapack-netlib/SRC/zla_syamv.f
index 02958bef3..cfdb3cdc8 100644
--- a/lapack-netlib/SRC/zla_syamv.f
+++ b/lapack-netlib/SRC/zla_syamv.f
@@ -241,7 +241,7 @@
          INFO = 10
       END IF
       IF( INFO.NE.0 )THEN
-         CALL XERBLA( 'DSYMV ', INFO )
+         CALL XERBLA( 'ZLA_SYAMV', INFO )
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/zlaqr1.f b/lapack-netlib/SRC/zlaqr1.f
index 03afb87aa..34341cb10 100644
--- a/lapack-netlib/SRC/zlaqr1.f
+++ b/lapack-netlib/SRC/zlaqr1.f
@@ -142,6 +142,13 @@
       CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) )
 *     ..
 *     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.NE.2 .AND. N.NE.3 ) THEN
+         RETURN
+      END IF
+*
       IF( N.EQ.2 ) THEN
          S = CABS1( H( 1, 1 )-S2 ) + CABS1( H( 2, 1 ) )
          IF( S.EQ.RZERO ) THEN
diff --git a/lapack-netlib/SRC/zsysv_aa.f b/lapack-netlib/SRC/zsysv_aa.f
index 10693c731..325d07c54 100644
--- a/lapack-netlib/SRC/zsysv_aa.f
+++ b/lapack-netlib/SRC/zsysv_aa.f
@@ -221,9 +221,6 @@
          LWKOPT_SYTRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
          WORK( 1 ) = LWKOPT
-         IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
-            INFO = -10
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f
index fcf9bc870..029ed587d 100644
--- a/lapack-netlib/SRC/zsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/zsysv_aa_2stage.f
@@ -105,6 +105,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -124,7 +125,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -150,6 +151,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -233,19 +235,18 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
+      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+         INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
+      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
          LWKOPT = INT( WORK(1) )
-         IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN
-            INFO = -7
-         ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN
-            INFO = -13
-         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f
index 1f916726e..d3486c1a7 100644
--- a/lapack-netlib/SRC/zsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f
@@ -93,6 +93,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N, internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
@@ -112,7 +113,7 @@
 *>
 *> \param[out] IPIV2
 *> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
+*>          IPIV2 is INTEGER array, dimension (N)
 *>          On exit, it contains the details of the interchanges, i.e.,
 *>          the row and column k of T were interchanged with the
 *>          row and column IPIV(k).
@@ -125,6 +126,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The size of WORK. LWORK >= N, internally used to select NB
 *>          such that LWORK >= N*NB.
 *>
@@ -662,6 +664,8 @@ c     $                     (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 )
 *
 *     Factor the band matrix
       CALL ZGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO )
+*
+      RETURN
 *
 *     End of ZSYTRF_AA_2STAGE
 *
diff --git a/lapack-netlib/SRC/zsytri2.f b/lapack-netlib/SRC/zsytri2.f
index d5aabd43a..e7303c90b 100644
--- a/lapack-netlib/SRC/zsytri2.f
+++ b/lapack-netlib/SRC/zsytri2.f
@@ -163,7 +163,7 @@
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
 *     Get blocksize
-      NBMAX = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 )
+      NBMAX = ILAENV( 1, 'ZSYTRI2', UPLO, N, -1, -1, -1 )
       IF ( NBMAX .GE. N ) THEN
          MINSIZE = N
       ELSE
diff --git a/lapack-netlib/SRC/zsytrs_aa_2stage.f b/lapack-netlib/SRC/zsytrs_aa_2stage.f
index c5d894753..fa15eee90 100644
--- a/lapack-netlib/SRC/zsytrs_aa_2stage.f
+++ b/lapack-netlib/SRC/zsytrs_aa_2stage.f
@@ -85,6 +85,7 @@
 *>
 *> \param[in] LTB
 *> \verbatim
+*>          LTB is INTEGER
 *>          The size of the array TB. LTB >= 4*N.
 *> \endverbatim
 *>

From c5b13d4e10d38eb1bad56aac21bc9ffcf0b577df Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 1 Jun 2018 15:14:45 +0200
Subject: [PATCH 13/86] Fixes from netlib PR 253

---
 lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f | 2 +-
 lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f | 2 +-
 lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f | 2 +-
 lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
index 5698bcf94..f6d990d1c 100644
--- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
@@ -218,7 +218,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAERH, ALAHD, ALASUM, DERRSY, DLACPY, DLARHS,
-     $                   DLATB4, DLATMS, DPOT02, DSYTRF_AA_2STAGE
+     $                   DLATB4, DLATMS, DPOT02, DSYTRF_AA_2STAGE,
      $                   DSYTRS_AA_2STAGE, XLAENV
 *     ..
 *     .. Intrinsic Functions ..
diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
index 0be321eb0..898422654 100644
--- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
@@ -204,7 +204,7 @@
 *     .. External Subroutines ..
       EXTERNAL           ALADHD, ALAERH, ALASVM, XLAENV, DERRVX,
      $                   DGET04, DLACPY, DLARHS, DLATB4, DLATMS,
-     $                   DSYSV_AA_2STAGE, CHET01_AA, DPOT02,
+     $                   DSYSV_AA_2STAGE, DPOT02,
      $                   DSYTRF_AA_2STAGE
 *     ..
 *     .. Scalars in Common ..
diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
index d8d9dc0a9..70e8ff6b8 100644
--- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
@@ -203,7 +203,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALADHD, ALAERH, ALASVM, XLAENV, SERRVX,
-     $                   CGET04, SLACPY, SLARHS, SLATB4, SLATMS,
+     $                   SLACPY, SLARHS, SLATB4, SLATMS,
      $                   SSYSV_AA_2STAGE, SSYT01_AA, SPOT02,
      $                   SSYTRF_AA_2STAGE
 *     ..
diff --git a/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f
index d4d8c2939..87fc47f71 100644
--- a/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f
@@ -217,8 +217,8 @@
       DOUBLE PRECISION   RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRSY, ZLACPY, ZLARHS,
-     $                   CLATB4, ZLATMS, ZSYT02, ZSYT01, 
+      EXTERNAL           ALAERH, ALAHD, ALASUM, ZERRSY, ZLACPY, ZLARHS,
+     $                   ZLATB4, ZLATMS, ZSYT02, ZSYT01, 
      $                   ZSYTRF_AA_2STAGE, ZSYTRS_AA_2STAGE,
      $                   XLAENV
 *     ..

From a8002e283a5874946bb464a45045d4651081e675 Mon Sep 17 00:00:00 2001
From: Matthew Brett <matthew.brett@gmail.com>
Date: Fri, 1 Jun 2018 23:20:00 +0100
Subject: [PATCH 14/86] Revert "take out unused variables"

This reverts commit e5752ff9b322c665a7393d6109c2da7ad6ee2523.

The variables i and n are used in the `#if !__GLIBC_PREREQ(2, 7)`
branch.

Closes gh-1586.
---
 driver/others/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index ef328b945..d69e52e97 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -180,7 +180,7 @@ int get_num_procs(void) {
 cpu_set_t *cpusetp;
 size_t size;
 int ret;
-// int i,n;
+int i,n;
 
   if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
 #if !defined(OS_LINUX)

From 99c7bba8e404fcf697f00bc986e106892eff47ad Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 3 Jun 2018 07:24:29 +0000
Subject: [PATCH 15/86] Initial support for SkylakeX / AVX512

This patch adds the basic infrastructure for adding the SkylakeX (Intel Skylake server)
target. The SkylakeX target will use the AVX512 (AVX512VL level) instruction set,
which brings 2 basic things:
1) 512 bit wide SIMD (2x width of AVX2)
2) 32 SIMD registers (2x the number on AVX2)

This initial patch only contains a trivial transformation of the Haswell SGEMM kernel
to AVX512VL; more will follow later but this patch aims to get the infrastructure
in place for this "later".

Full performance tuning has not been done yet; with more registers and wider SIMD
it's in theory possible to retune the kernels but even without that there's an
interesting enough performance increase (30-40% range) with just this change.
---
 Makefile.system                            |    8 +-
 TargetList.txt                             |    1 +
 cmake/arch.cmake                           |    3 +
 cmake/system.cmake                         |    2 +-
 cpuid.h                                    |    3 +
 cpuid_x86.c                                |    2 +
 driver/others/dynamic.c                    |    2 +
 driver/others/parameter.c                  |    4 +-
 getarch.c                                  |   15 +
 kernel/CMakeLists.txt                      |    2 +-
 kernel/Makefile.L3                         |    4 +
 kernel/setparam-ref.c                      |   16 +
 kernel/x86/trsm_kernel_LN_2x4_penryn.S     |    2 +-
 kernel/x86/trsm_kernel_LN_4x4_penryn.S     |    2 +-
 kernel/x86/trsm_kernel_LT_2x4_penryn.S     |    2 +-
 kernel/x86/trsm_kernel_LT_4x4_penryn.S     |    2 +-
 kernel/x86/trsm_kernel_RT_2x4_penryn.S     |    2 +-
 kernel/x86/trsm_kernel_RT_4x4_penryn.S     |    2 +-
 kernel/x86/ztrsm_kernel_LN_2x2_penryn.S    |    2 +-
 kernel/x86/ztrsm_kernel_LT_1x2_penryn.S    |    2 +-
 kernel/x86/ztrsm_kernel_LT_2x2_penryn.S    |    2 +-
 kernel/x86/ztrsm_kernel_RT_1x2_penryn.S    |    2 +-
 kernel/x86/ztrsm_kernel_RT_2x2_penryn.S    |    2 +-
 kernel/x86_64/KERNEL.SKYLAKEX              |    4 +
 kernel/x86_64/caxpy.c                      |    2 +-
 kernel/x86_64/cdot.c                       |    2 +-
 kernel/x86_64/cgemv_n_4.c                  |    2 +-
 kernel/x86_64/cgemv_t_4.c                  |    2 +-
 kernel/x86_64/cscal.c                      |    2 +-
 kernel/x86_64/daxpy.c                      |    2 +-
 kernel/x86_64/ddot.c                       |    2 +-
 kernel/x86_64/dgemv_n_4.c                  |    2 +-
 kernel/x86_64/dgemv_t_4.c                  |    2 +-
 kernel/x86_64/dscal.c                      |    2 +-
 kernel/x86_64/dsymv_L.c                    |    2 +-
 kernel/x86_64/dsymv_U.c                    |    2 +-
 kernel/x86_64/saxpy.c                      |    2 +-
 kernel/x86_64/sdot.c                       |    2 +-
 kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 6812 ++++++++++++++++++++
 kernel/x86_64/sgemv_n_4.c                  |    2 +-
 kernel/x86_64/sgemv_t_4.c                  |    2 +-
 kernel/x86_64/ssymv_L.c                    |    2 +-
 kernel/x86_64/ssymv_U.c                    |    2 +-
 kernel/x86_64/symv_L_sse.S                 |    2 +-
 kernel/x86_64/symv_L_sse2.S                |    2 +-
 kernel/x86_64/symv_U_sse.S                 |    2 +-
 kernel/x86_64/symv_U_sse2.S                |    2 +-
 kernel/x86_64/zaxpy.c                      |    2 +-
 kernel/x86_64/zdot.c                       |    2 +-
 kernel/x86_64/zgemv_n_4.c                  |    2 +-
 kernel/x86_64/zgemv_t_4.c                  |    2 +-
 kernel/x86_64/zscal.c                      |    2 +-
 kernel/x86_64/zsymv_L_sse.S                |    2 +-
 kernel/x86_64/zsymv_L_sse2.S               |    2 +-
 kernel/x86_64/zsymv_U_sse.S                |    2 +-
 kernel/x86_64/zsymv_U_sse2.S               |    2 +-
 param.h                                    |  119 +
 57 files changed, 7034 insertions(+), 47 deletions(-)
 create mode 100644 kernel/x86_64/KERNEL.SKYLAKEX
 create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex.S

diff --git a/Makefile.system b/Makefile.system
index 7bfac1fa8..b005b80c9 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -62,6 +62,9 @@ ifeq ($(BINARY), 32)
 ifeq ($(TARGET), HASWELL)
 GETARCH_FLAGS := -DFORCE_NEHALEM
 endif
+ifeq ($(TARGET), SKYLAKEX)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
 ifeq ($(TARGET), SANDYBRIDGE)
 GETARCH_FLAGS := -DFORCE_NEHALEM
 endif
@@ -95,6 +98,9 @@ ifeq ($(BINARY), 32)
 ifeq ($(TARGET_CORE), HASWELL)
 GETARCH_FLAGS := -DFORCE_NEHALEM
 endif
+ifeq ($(TARGET_CORE), SKYLAKEX)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
 ifeq ($(TARGET_CORE), SANDYBRIDGE)
 GETARCH_FLAGS := -DFORCE_NEHALEM
 endif
@@ -467,7 +473,7 @@ ifneq ($(NO_AVX), 1)
 DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
 endif
 ifneq ($(NO_AVX2), 1)
-DYNAMIC_CORE += HASWELL ZEN
+DYNAMIC_CORE += HASWELL ZEN SKYLAKEX
 endif
 endif
 
diff --git a/TargetList.txt b/TargetList.txt
index aeeaa9ede..31e4881c4 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -20,6 +20,7 @@ DUNNINGTON
 NEHALEM
 SANDYBRIDGE
 HASWELL
+SKYLAKEX
 ATOM
 
 b)AMD CPU:
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 798a9ef82..527d2bec6 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -56,6 +56,9 @@ if (DYNAMIC_ARCH)
     if (NOT NO_AVX2)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
     endif ()
+    if (NOT NO_AVX512)
+      set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
+    endif ()
   endif ()
 
   if (NOT DYNAMIC_CORE)
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 645895671..c21fe7c14 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -33,7 +33,7 @@ endif ()
 if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
   message(STATUS "Compiling a ${BINARY}-bit binary.")
   set(NO_AVX 1)
-  if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
+  if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
     set(TARGET "NEHALEM")
   endif ()
   if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
diff --git a/cpuid.h b/cpuid.h
index 1dacc49ba..a6bc211f3 100644
--- a/cpuid.h
+++ b/cpuid.h
@@ -115,6 +115,7 @@
 #define CORE_STEAMROLLER 25
 #define CORE_EXCAVATOR   26
 #define CORE_ZEN         27
+#define CORE_SKYLAKEX    28
 
 #define HAVE_SSE      (1 <<  0)
 #define HAVE_SSE2     (1 <<  1)
@@ -137,6 +138,7 @@
 #define HAVE_AVX      (1 <<  18)
 #define HAVE_FMA4     (1 <<  19)
 #define HAVE_FMA3     (1 <<  20)
+#define HAVE_AVX512VL (1 <<  21)
 
 #define CACHE_INFO_L1_I     1
 #define CACHE_INFO_L1_D     2
@@ -211,5 +213,6 @@ typedef struct {
 #define CPUTYPE_STEAMROLLER 		49
 #define CPUTYPE_EXCAVATOR 		50
 #define CPUTYPE_ZEN 			51
+#define CPUTYPE_SKYLAKEX		52
 
 #endif
diff --git a/cpuid_x86.c b/cpuid_x86.c
index 342c56525..5f49e7715 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -50,6 +50,8 @@
 #ifdef NO_AVX
 #define CPUTYPE_HASWELL CPUTYPE_NEHALEM
 #define CORE_HASWELL CORE_NEHALEM
+#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
+#define CORE_SKYLAKEX CORE_NEHALEM
 #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
 #define CORE_SANDYBRIDGE CORE_NEHALEM
 #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index fbf7cd40e..a0c9794b1 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -74,6 +74,7 @@ extern gotoblas_t  gotoblas_STEAMROLLER;
 extern gotoblas_t  gotoblas_EXCAVATOR;
 #ifdef NO_AVX2
 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE
+#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
 #define gotoblas_ZEN gotoblas_SANDYBRIDGE
 #else
 extern gotoblas_t  gotoblas_HASWELL;
@@ -83,6 +84,7 @@ extern gotoblas_t  gotoblas_ZEN;
 //Use NEHALEM kernels for sandy bridge
 #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
 #define gotoblas_HASWELL gotoblas_NEHALEM
+#define gotoblas_SKYLAKEX gotoblas_NEHALEM
 #define gotoblas_BULLDOZER gotoblas_BARCELONA
 #define gotoblas_PILEDRIVER gotoblas_BARCELONA
 #define gotoblas_STEAMROLLER gotoblas_BARCELONA
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index 31a48644f..e7332c0c4 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -167,7 +167,7 @@ int get_L2_size(void){
 #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
     defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
     defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
-    defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
+    defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
 
   cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
 
@@ -251,7 +251,7 @@ int get_L2_size(void){
 void blas_set_parameter(void){
 
   int factor;
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
   int size = 16;
 #else
   int size = get_L2_size();
diff --git a/getarch.c b/getarch.c
index 992fc2b95..fcffe63e2 100644
--- a/getarch.c
+++ b/getarch.c
@@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "HASWELL"
 #endif
 
+#ifdef FORCE_SKYLAKEX
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE    "X86"
+#define SUBARCHITECTURE "SKYLAKEX"
+#define ARCHCONFIG   "-DSKYLAKEX " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
+                     "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
+#define LIBNAME   "skylakex"
+#define CORENAME  "SKYLAKEX"
+#endif
+
 #ifdef FORCE_ATOM
 #define FORCE
 #define FORCE_INTEL
diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index c06d1eae8..947114ebe 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
     # Makefile.L3
     set(USE_TRMM false)
 
-    if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen")
+    if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex")
       set(USE_TRMM true)
     endif ()
 
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 066426396..b37e536ef 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL)
 USE_TRMM = 1
 endif
 
+ifeq ($(CORE), SKYLAKEX)
+USE_TRMM = 1
+endif
+
 ifeq ($(CORE), ZEN)
 USE_TRMM = 1
 endif
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index b6c5b54de..9030d7c6d 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -871,6 +871,22 @@ static void init_parameter(void) {
 #endif
 #endif
 
+#ifdef SKYLAKEX
+
+#ifdef DEBUG
+  fprintf(stderr, "SkylakeX\n");
+#endif
+
+  TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
+  TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
+  TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
+  TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
+#ifdef EXPRECISION
+  TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
+  TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
+#endif
+#endif
+
 
 #ifdef OPTERON
 
diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S
index 0b475afa2..34653d400 100644
--- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S
@@ -62,7 +62,7 @@
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH     prefetcht0
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S
index e98854f34..492f34344 100644
--- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S
@@ -62,7 +62,7 @@
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH     prefetcht0
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S
index 086852cfc..6840c54ad 100644
--- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S
@@ -62,7 +62,7 @@
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH     prefetcht0
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S
index 2dd8ad08b..361ccf603 100644
--- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S
@@ -62,7 +62,7 @@
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH     prefetcht0
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S
index 154276f6a..11825429e 100644
--- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S
+++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S
@@ -62,7 +62,7 @@
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH     prefetcht0
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S
index acdcd6e22..4c054f399 100644
--- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S
+++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S
@@ -62,7 +62,7 @@
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH     prefetcht0
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
index da561b583..e67496736 100644
--- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
@@ -61,7 +61,7 @@
 #define PREFETCHSIZE 84
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht1
 #define PREFETCHSIZE 84
 #endif
diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
index a11b0286a..498057697 100644
--- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
@@ -63,7 +63,7 @@
 #define PREFETCHSIZE 84
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht1
 #define PREFETCHSIZE 84
 #endif
diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
index 787ab5982..f3072983d 100644
--- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
@@ -61,7 +61,7 @@
 #define PREFETCHSIZE 84
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht1
 #define PREFETCHSIZE 84
 #endif
diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
index 9a3b0cbd7..879ae9c38 100644
--- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
@@ -63,7 +63,7 @@
 #define PREFETCHSIZE 84
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht1
 #define PREFETCHSIZE 84
 #endif
diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
index bd7a78b5a..6c308197b 100644
--- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
@@ -61,7 +61,7 @@
 #define PREFETCHSIZE 84
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht1
 #define PREFETCHSIZE 84
 #endif
diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
new file mode 100644
index 000000000..744831d67
--- /dev/null
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -0,0 +1,4 @@
+include $(KERNELDIR)/KERNEL.HASWELL
+
+SGEMMKERNEL    =  sgemm_kernel_16x4_skylakex.S
+
diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c
index b1ec19bd3..586d05ac2 100644
--- a/kernel/x86_64/caxpy.c
+++ b/kernel/x86_64/caxpy.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "caxpy_microk_steamroller-2.c"
 #elif defined(BULLDOZER)
 #include "caxpy_microk_bulldozer-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX)
 #include "caxpy_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "caxpy_microk_sandy-2.c"
diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c
index 5f01f7eeb..93fca0a0d 100644
--- a/kernel/x86_64/cdot.c
+++ b/kernel/x86_64/cdot.c
@@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cdot_microk_bulldozer-2.c"
 #elif defined(STEAMROLLER) || defined(PILEDRIVER)  || defined(EXCAVATOR)
 #include "cdot_microk_steamroller-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "cdot_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "cdot_microk_sandy-2.c"
diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c
index 770c955b2..d81766cd4 100644
--- a/kernel/x86_64/cgemv_n_4.c
+++ b/kernel/x86_64/cgemv_n_4.c
@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <stdio.h>
 #include "common.h"
 
-#if defined(HASWELL) || defined(ZEN)
+#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "cgemv_n_microk_haswell-4.c"
 #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
 #include "cgemv_n_microk_bulldozer-4.c"
diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c
index d75e58fdd..6bdea6787 100644
--- a/kernel/x86_64/cgemv_t_4.c
+++ b/kernel/x86_64/cgemv_t_4.c
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if defined(HASWELL) || defined(ZEN)
+#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "cgemv_t_microk_haswell-4.c"
 #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)  || defined(EXCAVATOR)
 #include "cgemv_t_microk_bulldozer-4.c"
diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c
index 9b9179da0..72af99809 100644
--- a/kernel/x86_64/cscal.c
+++ b/kernel/x86_64/cscal.c
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 
-#if defined(HASWELL) || defined(ZEN)
+#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "cscal_microk_haswell-2.c"
 #elif defined(BULLDOZER)  || defined(PILEDRIVER)
 #include "cscal_microk_bulldozer-2.c"
diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c
index 4bde62824..b4acdccd2 100644
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "daxpy_microk_steamroller-2.c"
 #elif defined(PILEDRIVER)
 #include "daxpy_microk_piledriver-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "daxpy_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "daxpy_microk_sandy-2.c"
diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c
index 8162a5d83..059549028 100644
--- a/kernel/x86_64/ddot.c
+++ b/kernel/x86_64/ddot.c
@@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ddot_microk_piledriver-2.c"
 #elif defined(NEHALEM) 
 #include "ddot_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "ddot_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "ddot_microk_sandy-2.c"
diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
index 1b9ca7a60..309fbe767 100644
--- a/kernel/x86_64/dgemv_n_4.c
+++ b/kernel/x86_64/dgemv_n_4.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(NEHALEM)
 #include "dgemv_n_microk_nehalem-4.c"
-#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
+#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
 #include "dgemv_n_microk_haswell-4.c"
 #endif
 
diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c
index 6b99d6fdd..a7478e3a8 100644
--- a/kernel/x86_64/dgemv_t_4.c
+++ b/kernel/x86_64/dgemv_t_4.c
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER)  || defined(EXCAVATOR)
+#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER)  || defined(EXCAVATOR) || defined (SKYLAKEX)
 #include "dgemv_t_microk_haswell-4.c"
 #endif
 
diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c
index 428558617..2c7b3b17c 100644
--- a/kernel/x86_64/dscal.c
+++ b/kernel/x86_64/dscal.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dscal_microk_bulldozer-2.c"
 #elif defined(SANDYBRIDGE)
 #include "dscal_microk_sandy-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "dscal_microk_haswell-2.c"
 #endif
 
diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c
index 3e8db3fa3..73099462c 100644
--- a/kernel/x86_64/dsymv_L.c
+++ b/kernel/x86_64/dsymv_L.c
@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
 #include "dsymv_L_microk_bulldozer-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "dsymv_L_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "dsymv_L_microk_sandy-2.c"
diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c
index 61cb77a64..431e4bb3f 100644
--- a/kernel/x86_64/dsymv_U.c
+++ b/kernel/x86_64/dsymv_U.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)  || defined(EXCAVATOR)
 #include "dsymv_U_microk_bulldozer-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "dsymv_U_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "dsymv_U_microk_sandy-2.c"
diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c
index d89fe408a..d89c4070d 100644
--- a/kernel/x86_64/saxpy.c
+++ b/kernel/x86_64/saxpy.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(NEHALEM)
 #include "saxpy_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "saxpy_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "saxpy_microk_sandy-2.c"
diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c
index b6f3c21af..c3ab2ffe6 100644
--- a/kernel/x86_64/sdot.c
+++ b/kernel/x86_64/sdot.c
@@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "sdot_microk_steamroller-2.c"
 #elif defined(NEHALEM)
 #include "sdot_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "sdot_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "sdot_microk_sandy-2.c"
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S
new file mode 100644
index 000000000..1fab892ca
--- /dev/null
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S
@@ -0,0 +1,6812 @@
+/*********************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+/*********************************************************************
+* 2014/07/28 Saar
+*        BLASTEST               : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+* 2013/10/28 Saar
+* Parameter:
+*	SGEMM_DEFAULT_UNROLL_N	4
+*	SGEMM_DEFAULT_UNROLL_M	16
+*	SGEMM_DEFAULT_P		768
+*	SGEMM_DEFAULT_Q		384
+*	A_PR1			512
+*	B_PR1			512
+*	
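+* NOTE(review): the entries in this header are dated 2013/2014 although this file is newly added here; the figures below were presumably carried over from an older kernel and should be re-measured for SKYLAKEX.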
+* 2014/07/28 Saar
+* Performance at 9216x9216x9216:
+*       1 thread:      102 GFLOPS       (SANDYBRIDGE:  59)      (MKL:   83)
+*       2 threads:     195 GFLOPS       (SANDYBRIDGE: 116)      (MKL:  155)
+*       3 threads:     281 GFLOPS       (SANDYBRIDGE: 165)      (MKL:  230)
+*       4 threads:     366 GFLOPS       (SANDYBRIDGE: 223)      (MKL:  267)
+*
+*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+/* Symbolic register names used by the kernel macros and code below. */
+#define OLD_M	%rdi
+#define OLD_N	%rsi
+#define M	%r13
+#define J	%r14
+#define OLD_K	%rdx
+/* LDC is used as the index register in (CO1, LDC) and (CO2, LDC) addresses of the SAVE macros. */
+#define A	%rcx
+#define B	%r8
+#define C	%r9
+#define LDC	%r10
+/* NOTE: several names share one register: AO, BO1 and OLD_M are %rdi; BO and OLD_N are %rsi; */
+#define I	%r11
+#define AO	%rdi
+#define BO	%rsi
+#define	CO1	%r15
+#define K	%r12
+#define BI	%rbp
+#define BO2	%rbp
+#define	SP	%rbx
+/* CO2 and OLD_K are %rdx; BI and BO2 are %rbp. Names that share a register cannot hold distinct live values at the same time. */
+#define BO1	%rdi
+#define	CO2	%rdx
+/* STACKSIZE, plus (only when WINDOWS_ABI is defined) the %rsp-relative locations named OLD_A .. OLD_OFFSET. */
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+
+#else
+
+#define STACKSIZE 256
+
+#define OLD_A		40 + STACKSIZE(%rsp)
+#define OLD_B		48 + STACKSIZE(%rsp)
+#define OLD_C		56 + STACKSIZE(%rsp)
+#define OLD_LDC		64 + STACKSIZE(%rsp)
+#define OLD_OFFSET	72 + STACKSIZE(%rsp)
+
+#endif
+/* L_BUFFER_SIZE selects the STACK_TOUCH variant below; it is smaller when OS_WINDOWS is defined. */
+#if defined(OS_WINDOWS)
+#define L_BUFFER_SIZE 8192
+#else
+#define L_BUFFER_SIZE 12288
+#endif
+/* Scratch slots addressed relative to %rsp. */
+#define Ndiv6	 24(%rsp)
+#define Nmod6	 32(%rsp)
+#define N	 40(%rsp)
+#define ALPHA	 48(%rsp)
+#define OFFSET	 56(%rsp)
+#define KK	 64(%rsp)
+#define KKK	 72(%rsp)
+#define BUFFER1	           128(%rsp)
+/* STACK_TOUCH: on OS_WINDOWS, stores 0 at 4096-byte intervals above %rsp (one to four stores, chosen by L_BUFFER_SIZE), presumably to touch each stack page - TODO confirm; empty otherwise. */
+#if defined(OS_WINDOWS)
+#if   L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+        movl    $0,  4096 * 4(%rsp);\
+        movl    $0,  4096 * 3(%rsp);\
+        movl    $0,  4096 * 2(%rsp);\
+        movl    $0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+        movl    $0,  4096 * 3(%rsp);\
+        movl    $0,  4096 * 2(%rsp);\
+        movl    $0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+        movl    $0,  4096 * 2(%rsp);\
+        movl    $0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+        movl    $0,  4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+/* VFMADD231PS_ / VFMADD231SS_ (acc, x, y): acc += x * y. Emitted as four-operand vfmaddps/vfmaddss when BULLDOZER is defined, as vfmadd231ps/vfmadd231ss otherwise. */
+#if defined(BULLDOZER)
+
+#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
+
+#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0
+
+#else
+
+#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+
+#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0
+
+#endif
+
+/* A_PR1 is the byte displacement from AO used by the prefetcht0 instructions in the 16x6 kernel macros. */
+#define	A_PR1	512
+#define	B_PR1	512
+
+/*******************************************************************************************
+* 6 lines of N
+*******************************************************************************************/
+
+.macro KERNEL16x6_SUB
+	vmovups 	-16 * SIZE(AO), %zmm0
+	vbroadcastss	 -4 * SIZE(BO), %zmm2
+	vbroadcastss	 -3 * SIZE(BO), %zmm3
+	prefetcht0	A_PR1(AO)
+	/* One k-step: zmm0 is a 64-byte vector from AO; each of zmm4, zmm6, zmm8, zmm10, zmm12, zmm14 += zmm0 * one broadcast scalar from BO. */
+	VFMADD231PS_(  	%zmm4,%zmm2,%zmm0 )
+	VFMADD231PS_(  	%zmm6,%zmm3,%zmm0 )
+
+	vbroadcastss	 -2 * SIZE(BO), %zmm2
+	vbroadcastss	 -1 * SIZE(BO), %zmm3
+	VFMADD231PS_(  	%zmm8,%zmm2,%zmm0  )
+	VFMADD231PS_(  	%zmm10,%zmm3,%zmm0 )
+
+	vbroadcastss	  0 * SIZE(BO), %zmm2
+	vbroadcastss	  1 * SIZE(BO), %zmm3
+	VFMADD231PS_(  	%zmm12,%zmm2,%zmm0  )
+	VFMADD231PS_(  	%zmm14,%zmm3,%zmm0 )
+	/* Advance BO by 6 and AO by 16 elements of SIZE bytes; decq %rax sets the flags (presumably tested by the invoking loop - confirm). */
+	addq	$ 6*SIZE, BO 
+	addq	$ 16*SIZE, AO 
+	decq	%rax 
+.endm
+
+.macro KERNEL16x6_SUB4
+	vmovups 	-16 * SIZE(AO), %zmm0
+	vbroadcastss	 -4 * SIZE(BO), %zmm2
+	vbroadcastss	 -3 * SIZE(BO), %zmm3
+	prefetcht0	A_PR1(AO)
+	/* Four consecutive k-steps of the 16x6 update, written out inline. Each step reloads zmm0 from AO and accumulates into zmm4..zmm14 (even). */
+	VFMADD231PS_(  	%zmm4,%zmm2,%zmm0 )
+	VFMADD231PS_(  	%zmm6,%zmm3,%zmm0 )
+	/* Each step broadcasts its B scalars into registers not used by the other steps; only this first step issues a prefetch. */
+	vbroadcastss	 -2 * SIZE(BO), %zmm7
+	vbroadcastss	 -1 * SIZE(BO), %zmm9
+	VFMADD231PS_(  	%zmm8,%zmm7,%zmm0  )
+	VFMADD231PS_(  	%zmm10,%zmm9,%zmm0 )
+
+	vbroadcastss	  0 * SIZE(BO), %zmm11
+	vbroadcastss	  1 * SIZE(BO), %zmm13
+	VFMADD231PS_(  	%zmm12,%zmm11,%zmm0  )
+	VFMADD231PS_(  	%zmm14,%zmm13,%zmm0 )
+
+	addq	$ 6*SIZE, BO 
+	addq	$ 16*SIZE, AO 
+	decq	%rax 
+	vmovups 	-16 * SIZE(AO), %zmm0	/* second k-step; B scalars go to zmm16..zmm21 */
+	vbroadcastss	 -4 * SIZE(BO), %zmm16
+	vbroadcastss	 -3 * SIZE(BO), %zmm17
+
+	VFMADD231PS_(  	%zmm4,%zmm16,%zmm0 )
+	VFMADD231PS_(  	%zmm6,%zmm17,%zmm0 )
+
+	vbroadcastss	 -2 * SIZE(BO), %zmm18
+	vbroadcastss	 -1 * SIZE(BO), %zmm19
+	VFMADD231PS_(  	%zmm8,%zmm18,%zmm0  )
+	VFMADD231PS_(  	%zmm10,%zmm19,%zmm0 )
+
+	vbroadcastss	  0 * SIZE(BO), %zmm20
+	vbroadcastss	  1 * SIZE(BO), %zmm21
+	VFMADD231PS_(  	%zmm12,%zmm20,%zmm0  )
+	VFMADD231PS_(  	%zmm14,%zmm21,%zmm0 )
+
+	addq	$ 6*SIZE, BO 
+	addq	$ 16*SIZE, AO 
+	decq	%rax 
+
+	vmovups 	-16 * SIZE(AO), %zmm0	/* third k-step; B scalars go to zmm22..zmm27 */
+	vbroadcastss	 -4 * SIZE(BO), %zmm22
+	vbroadcastss	 -3 * SIZE(BO), %zmm23
+
+	VFMADD231PS_(  	%zmm4,%zmm22,%zmm0 )
+	VFMADD231PS_(  	%zmm6,%zmm23,%zmm0 )
+
+	vbroadcastss	 -2 * SIZE(BO), %zmm24
+	vbroadcastss	 -1 * SIZE(BO), %zmm25
+	VFMADD231PS_(  	%zmm8,%zmm24,%zmm0  )
+	VFMADD231PS_(  	%zmm10,%zmm25,%zmm0 )
+
+	vbroadcastss	  0 * SIZE(BO), %zmm26
+	vbroadcastss	  1 * SIZE(BO), %zmm27
+	VFMADD231PS_(  	%zmm12,%zmm26,%zmm0  )
+	VFMADD231PS_(  	%zmm14,%zmm27,%zmm0 )
+
+	addq	$ 6*SIZE, BO 
+	addq	$ 16*SIZE, AO 
+	decq	%rax 
+	vmovups 	-16 * SIZE(AO), %zmm0	/* fourth k-step; B scalars go to zmm28..zmm31, zmm1 and zmm5 */
+	vbroadcastss	 -4 * SIZE(BO), %zmm28
+	vbroadcastss	 -3 * SIZE(BO), %zmm29
+
+	VFMADD231PS_(  	%zmm4,%zmm28,%zmm0 )
+	VFMADD231PS_(  	%zmm6,%zmm29,%zmm0 )
+
+	vbroadcastss	 -2 * SIZE(BO), %zmm30
+	vbroadcastss	 -1 * SIZE(BO), %zmm31
+	VFMADD231PS_(  	%zmm8,%zmm30,%zmm0  )
+	VFMADD231PS_(  	%zmm10,%zmm31,%zmm0 )
+
+	vbroadcastss	  0 * SIZE(BO), %zmm1
+	vbroadcastss	  1 * SIZE(BO), %zmm5
+	VFMADD231PS_(  	%zmm12,%zmm1,%zmm0  )
+	VFMADD231PS_(  	%zmm14,%zmm5,%zmm0 )
+	/* In total BO has advanced by 24 and AO by 64 elements, and %rax has been decremented four times; the flags come from the last decq. */
+	addq	$ 6*SIZE, BO 
+	addq	$ 16*SIZE, AO 
+	decq	%rax 
+.endm
+
+.macro SAVE16x6
+	/* Scale zmm4, zmm6, zmm8, zmm10, zmm12, zmm14 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 64 bytes from each. */
+	vbroadcastss	ALPHA, %zmm0
+	/* Targets: (CO1), (CO1,LDC), (CO1,LDC,2), (CO2), (CO2,LDC), (CO2,LDC,2). CO2 is set by the invoking code, presumably to CO1 + 3*LDC - confirm. */
+	vmulps	%zmm0 , %zmm4 , %zmm4
+	vmulps	%zmm0 , %zmm6 , %zmm6
+	vmulps	%zmm0 , %zmm8 , %zmm8
+	vmulps	%zmm0 , %zmm10, %zmm10
+	vmulps	%zmm0 , %zmm12, %zmm12
+	vmulps	%zmm0 , %zmm14, %zmm14
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %zmm4,%zmm4
+
+	vaddps 	        (CO1, LDC), %zmm6,%zmm6
+
+	vaddps 	        (CO1, LDC,2), %zmm8,%zmm8
+
+	vaddps 	        (CO2), %zmm10,%zmm10
+
+	vaddps 	        (CO2, LDC), %zmm12,%zmm12
+
+	vaddps 	        (CO2, LDC,2), %zmm14,%zmm14
+
+#endif
+
+	vmovups	%zmm4 ,  	(CO1)
+
+	vmovups	%zmm6 ,  	(CO1, LDC)
+
+	vmovups	%zmm8 ,  	(CO1, LDC,2)
+
+	vmovups	%zmm10,  	(CO2)
+
+	vmovups	%zmm12,  	(CO2, LDC)
+
+	vmovups	%zmm14,  	(CO2, LDC,2)
+
+.endm
+
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL8x6_SUB
+	vmovups 	-16 * SIZE(AO), %ymm0
+	vbroadcastss	 -4 * SIZE(BO), %ymm2
+	vbroadcastss	 -3 * SIZE(BO), %ymm3
+	/* One k-step: ymm0 is a 32-byte vector from AO; each of ymm4, ymm6, ymm8, ymm10, ymm12, ymm14 += ymm0 * one broadcast scalar from BO. */
+	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
+	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )
+
+	vbroadcastss	 -2 * SIZE(BO), %ymm2
+	vbroadcastss	 -1 * SIZE(BO), %ymm3
+	VFMADD231PS_(  	%ymm8,%ymm2,%ymm0  )
+	VFMADD231PS_(  	%ymm10,%ymm3,%ymm0 )
+
+	vbroadcastss	  0 * SIZE(BO), %ymm2
+	vbroadcastss	  1 * SIZE(BO), %ymm3
+	VFMADD231PS_(  	%ymm12,%ymm2,%ymm0  )
+	VFMADD231PS_(  	%ymm14,%ymm3,%ymm0 )
+	/* Advance BO by 6 and AO by 8 elements of SIZE bytes; decq %rax sets the flags (presumably tested by the invoking loop - confirm). */
+	addq	$ 6*SIZE, BO 
+	addq	$ 8*SIZE, AO 
+	decq	%rax 
+.endm
+
+.macro SAVE8x6
+	/* Scale ymm4, ymm6, ymm8, ymm10, ymm12, ymm14 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 32 bytes from each. */
+	vbroadcastss	ALPHA, %ymm0
+	/* Targets: (CO1), (CO1,LDC), (CO1,LDC,2), (CO2), (CO2,LDC), (CO2,LDC,2); CO2 is set by the invoking code. */
+	vmulps	%ymm0 , %ymm4 , %ymm4
+	vmulps	%ymm0 , %ymm6 , %ymm6
+	vmulps	%ymm0 , %ymm8 , %ymm8
+	vmulps	%ymm0 , %ymm10, %ymm10
+	vmulps	%ymm0 , %ymm12, %ymm12
+	vmulps	%ymm0 , %ymm14, %ymm14
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %ymm4,%ymm4
+	vaddps 	        (CO1, LDC), %ymm6,%ymm6
+	vaddps 	        (CO1, LDC,2), %ymm8,%ymm8
+	vaddps 	        (CO2), %ymm10,%ymm10
+	vaddps 	        (CO2, LDC), %ymm12,%ymm12
+	vaddps 	        (CO2, LDC,2), %ymm14,%ymm14
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm6 ,  	(CO1, LDC)
+	vmovups	%ymm8 ,  	(CO1, LDC,2)
+	vmovups	%ymm10,  	(CO2)
+	vmovups	%ymm12,  	(CO2, LDC)
+	vmovups	%ymm14,  	(CO2, LDC,2)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL4x6_SUB
+	vmovups 	-16 * SIZE(AO), %xmm0
+	vbroadcastss	 -4 * SIZE(BO), %xmm2
+	vbroadcastss	 -3 * SIZE(BO), %xmm3
+	/* One k-step: xmm0 is a 16-byte vector from AO; each of xmm4, xmm6, xmm8, xmm10, xmm12, xmm14 += xmm0 * one broadcast scalar from BO. */
+	VFMADD231PS_(  	%xmm4,%xmm2,%xmm0 )
+	VFMADD231PS_(  	%xmm6,%xmm3,%xmm0 )
+
+	vbroadcastss	 -2 * SIZE(BO), %xmm2
+	vbroadcastss	 -1 * SIZE(BO), %xmm3
+	VFMADD231PS_(  	%xmm8,%xmm2,%xmm0  )
+	VFMADD231PS_(  	%xmm10,%xmm3,%xmm0 )
+
+	vbroadcastss	  0 * SIZE(BO), %xmm2
+	vbroadcastss	  1 * SIZE(BO), %xmm3
+	VFMADD231PS_(  	%xmm12,%xmm2,%xmm0  )
+	VFMADD231PS_(  	%xmm14,%xmm3,%xmm0 )
+	/* Advance BO by 6 and AO by 4 elements of SIZE bytes; decq %rax sets the flags (presumably tested by the invoking loop - confirm). */
+	addq	$ 6*SIZE, BO 
+	addq	$ 4*SIZE, AO 
+	decq	%rax 
+.endm
+
+.macro SAVE4x6
+	/* Scale xmm4, xmm6, xmm8, xmm10, xmm12, xmm14 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 16 bytes from each. */
+	vbroadcastss	ALPHA, %xmm0
+	/* Targets: (CO1), (CO1,LDC), (CO1,LDC,2), (CO2), (CO2,LDC), (CO2,LDC,2); CO2 is set by the invoking code. */
+	vmulps	%xmm0 , %xmm4 , %xmm4
+	vmulps	%xmm0 , %xmm6 , %xmm6
+	vmulps	%xmm0 , %xmm8 , %xmm8
+	vmulps	%xmm0 , %xmm10, %xmm10
+	vmulps	%xmm0 , %xmm12, %xmm12
+	vmulps	%xmm0 , %xmm14, %xmm14
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %xmm4,%xmm4
+	vaddps 	        (CO1, LDC), %xmm6,%xmm6
+	vaddps 	        (CO1, LDC,2), %xmm8,%xmm8
+	vaddps 	        (CO2), %xmm10,%xmm10
+	vaddps 	        (CO2, LDC), %xmm12,%xmm12
+	vaddps 	        (CO2, LDC,2), %xmm14,%xmm14
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+	vmovups	%xmm6 ,  	(CO1, LDC)
+	vmovups	%xmm8 ,  	(CO1, LDC,2)
+	vmovups	%xmm10,  	(CO2)
+	vmovups	%xmm12,  	(CO2, LDC)
+	vmovups	%xmm14,  	(CO2, LDC,2)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL2x6_SUB
+	vmovss 	-16 * SIZE(AO), %xmm0
+	vmovss 	-15 * SIZE(AO), %xmm1
+	vmovss	 -4 * SIZE(BO), %xmm2
+	vmovss	 -3 * SIZE(BO), %xmm3
+	/* One k-step with scalar FMAs: two A scalars (xmm0, xmm1) times six B scalars; the pairs (xmm4,xmm5) .. (xmm14,xmm15) each accumulate the products for one B scalar. */
+	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
+	VFMADD231SS_(  	%xmm5,%xmm2,%xmm1 )
+	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
+	VFMADD231SS_(  	%xmm7,%xmm3,%xmm1 )
+
+	vmovss	 -2 * SIZE(BO), %xmm2
+	vmovss	 -1 * SIZE(BO), %xmm3
+	VFMADD231SS_(  	%xmm8,%xmm2,%xmm0  )
+	VFMADD231SS_(  	%xmm9,%xmm2,%xmm1  )
+	VFMADD231SS_(  	%xmm10,%xmm3,%xmm0 )
+	VFMADD231SS_(  	%xmm11,%xmm3,%xmm1 )
+
+	vmovss	  0 * SIZE(BO), %xmm2
+	vmovss	  1 * SIZE(BO), %xmm3
+	VFMADD231SS_(  	%xmm12,%xmm2,%xmm0  )
+	VFMADD231SS_(  	%xmm13,%xmm2,%xmm1  )
+	VFMADD231SS_(  	%xmm14,%xmm3,%xmm0 )
+	VFMADD231SS_(  	%xmm15,%xmm3,%xmm1 )
+	/* Advance BO by 6 and AO by 2 elements of SIZE bytes; decq %rax sets the flags (presumably tested by the invoking loop - confirm). */
+	addq	$ 6*SIZE, BO 
+	addq	$ 2*SIZE, AO 
+	decq	%rax 
+.endm
+
+.macro SAVE2x6
+	/* Scalar variant: scale xmm4..xmm15 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store one scalar from each. */
+	vmovss	ALPHA, %xmm0
+	/* Each register pair goes to offsets 0 and 1*SIZE of (CO1), (CO1,LDC), (CO1,LDC,2), (CO2), (CO2,LDC), (CO2,LDC,2); CO2 is set by the invoking code. */
+	vmulss	%xmm0 , %xmm4 , %xmm4
+	vmulss	%xmm0 , %xmm5 , %xmm5
+	vmulss	%xmm0 , %xmm6 , %xmm6
+	vmulss	%xmm0 , %xmm7 , %xmm7
+	vmulss	%xmm0 , %xmm8 , %xmm8
+	vmulss	%xmm0 , %xmm9 , %xmm9
+	vmulss	%xmm0 , %xmm10, %xmm10
+	vmulss	%xmm0 , %xmm11, %xmm11
+	vmulss	%xmm0 , %xmm12, %xmm12
+	vmulss	%xmm0 , %xmm13, %xmm13
+	vmulss	%xmm0 , %xmm14, %xmm14
+	vmulss	%xmm0 , %xmm15, %xmm15
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddss 	        (CO1), %xmm4,%xmm4
+	vaddss  1 * SIZE(CO1), %xmm5,%xmm5
+
+	vaddss 	        (CO1, LDC), %xmm6,%xmm6
+	vaddss  1 * SIZE(CO1, LDC), %xmm7,%xmm7
+
+	vaddss 	        (CO1, LDC,2), %xmm8,%xmm8
+	vaddss  1 * SIZE(CO1, LDC,2), %xmm9,%xmm9
+
+	vaddss 	        (CO2), %xmm10,%xmm10
+	vaddss  1 * SIZE(CO2), %xmm11,%xmm11
+
+	vaddss 	        (CO2, LDC), %xmm12,%xmm12
+	vaddss  1 * SIZE(CO2, LDC), %xmm13,%xmm13
+
+	vaddss 	        (CO2, LDC,2), %xmm14,%xmm14
+	vaddss  1 * SIZE(CO2, LDC,2), %xmm15,%xmm15
+
+#endif
+
+	vmovss	%xmm4 ,  	(CO1)
+	vmovss	%xmm5 , 1 * SIZE(CO1)
+
+	vmovss	%xmm6 ,  	(CO1, LDC)
+	vmovss	%xmm7 , 1 * SIZE(CO1, LDC)
+
+	vmovss	%xmm8 ,  	(CO1, LDC,2)
+	vmovss	%xmm9 , 1 * SIZE(CO1, LDC,2)
+
+	vmovss	%xmm10,  	(CO2)
+	vmovss	%xmm11, 1 * SIZE(CO2)
+
+	vmovss	%xmm12,  	(CO2, LDC)
+	vmovss	%xmm13, 1 * SIZE(CO2, LDC)
+
+	vmovss	%xmm14,  	(CO2, LDC,2)
+	vmovss	%xmm15, 1 * SIZE(CO2, LDC,2)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL1x6_SUB
+	vmovss 	-16 * SIZE(AO), %xmm0
+	vmovss	 -4 * SIZE(BO), %xmm2
+	vmovss	 -3 * SIZE(BO), %xmm3
+	/* One k-step with scalar FMAs: one A scalar (xmm0) times six B scalars, accumulated into xmm4, xmm6, xmm8, xmm10, xmm12, xmm14. */
+	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
+	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
+
+	vmovss	 -2 * SIZE(BO), %xmm2
+	vmovss	 -1 * SIZE(BO), %xmm3
+	VFMADD231SS_(  	%xmm8,%xmm2,%xmm0  )
+	VFMADD231SS_(  	%xmm10,%xmm3,%xmm0 )
+
+	vmovss	  0 * SIZE(BO), %xmm2
+	vmovss	  1 * SIZE(BO), %xmm3
+	VFMADD231SS_(  	%xmm12,%xmm2,%xmm0  )
+	VFMADD231SS_(  	%xmm14,%xmm3,%xmm0 )
+	/* Advance BO by 6 and AO by 1 element of SIZE bytes; decq %rax sets the flags (presumably tested by the invoking loop - confirm). */
+	addq	$ 6*SIZE, BO 
+	addq	$ 1*SIZE, AO 
+	decq	%rax 
+.endm
+
+.macro SAVE1x6
+	/* Scalar variant: scale xmm4, xmm6, xmm8, xmm10, xmm12, xmm14 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store one scalar from each. */
+	vmovss	ALPHA, %xmm0
+	/* Targets: (CO1), (CO1,LDC), (CO1,LDC,2), (CO2), (CO2,LDC), (CO2,LDC,2); CO2 is set by the invoking code. */
+	vmulss	%xmm0 , %xmm4 , %xmm4
+	vmulss	%xmm0 , %xmm6 , %xmm6
+	vmulss	%xmm0 , %xmm8 , %xmm8
+	vmulss	%xmm0 , %xmm10, %xmm10
+	vmulss	%xmm0 , %xmm12, %xmm12
+	vmulss	%xmm0 , %xmm14, %xmm14
+
+#if !defined(TRMMKERNEL)
+
+	vaddss 	        (CO1), %xmm4,%xmm4
+	vaddss 	        (CO1, LDC), %xmm6,%xmm6
+	vaddss 	        (CO1, LDC,2), %xmm8,%xmm8
+	vaddss 	        (CO2), %xmm10,%xmm10
+	vaddss 	        (CO2, LDC), %xmm12,%xmm12
+	vaddss 	        (CO2, LDC,2), %xmm14,%xmm14
+
+#endif
+
+	vmovss	%xmm4 ,  	(CO1)
+	vmovss	%xmm6 ,  	(CO1, LDC)
+	vmovss	%xmm8 ,  	(CO1, LDC,2)
+	vmovss	%xmm10,  	(CO2)
+	vmovss	%xmm12,  	(CO2, LDC)
+	vmovss	%xmm14,  	(CO2, LDC,2)
+
+.endm
+
+
+/*******************************************************************************************/
+
+
+/*******************************************************************************************
+* 4 lines of N
+*******************************************************************************************/
+
+.macro KERNEL16x4_SUB
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %zmm0	/* one k-step: 64-byte A vector at element index %rax */
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %zmm2
+	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PS_(  	%zmm4,%zmm2,%zmm0 )	/* zmm4, zmm6, zmm8, zmm10 each += zmm0 * one broadcast B scalar */
+	VFMADD231PS_(  	%zmm6,%zmm3,%zmm0 )
+	vbroadcastss	 -2 * SIZE(BO, BI, SIZE), %zmm2
+	vbroadcastss	 -1 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PS_(  	%zmm8,%zmm2,%zmm0  )
+	VFMADD231PS_(  	%zmm10,%zmm3,%zmm0 )
+	addq	$ 4 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 16, %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE16x4
+	/* Scale zmm4, zmm6, zmm8, zmm10 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 64 bytes from each. */
+	vbroadcastss	ALPHA, %zmm0
+	/* Targets: (CO1), (CO1,LDC), (CO2), (CO2,LDC). CO2 is set by the invoking code, presumably to CO1 + 2*LDC - confirm. */
+	vmulps	%zmm0 , %zmm4 , %zmm4
+	vmulps	%zmm0 , %zmm6 , %zmm6
+	vmulps	%zmm0 , %zmm8 , %zmm8
+	vmulps	%zmm0 , %zmm10, %zmm10
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %zmm4,%zmm4
+
+	vaddps 	        (CO1, LDC), %zmm6,%zmm6
+
+	vaddps 	        (CO2), %zmm8,%zmm8
+
+	vaddps 	        (CO2, LDC), %zmm10,%zmm10
+
+#endif
+
+	vmovups	%zmm4 ,  	(CO1)
+
+	vmovups	%zmm6 ,  	(CO1, LDC)
+
+	vmovups	%zmm8 ,  	(CO2)
+
+	vmovups	%zmm10,  	(CO2, LDC)
+	/* Prefetch the 64 bytes that follow each of the four locations just stored. */
+	prefetcht0	64(CO1)
+	prefetcht0	64(CO1, LDC)
+	prefetcht0	64(CO2)
+	prefetcht0	64(CO2, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL8x4_SUB
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0	/* one k-step: 32-byte A vector at element index %rax */
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2
+	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )	/* ymm4, ymm6, ymm8, ymm10 each += ymm0 * one broadcast B scalar */
+	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )
+	vbroadcastss	 -2 * SIZE(BO, BI, SIZE), %ymm2
+	vbroadcastss	 -1 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PS_(  	%ymm8,%ymm2,%ymm0  )
+	VFMADD231PS_(  	%ymm10,%ymm3,%ymm0 )
+	addq	$ 4 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 8 , %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE8x4
+	/* Scale ymm4, ymm6, ymm8, ymm10 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 32 bytes from each. */
+	vbroadcastss	ALPHA, %ymm0
+	/* Targets: (CO1), (CO1,LDC), (CO2), (CO2,LDC); CO2 is set by the invoking code. */
+	vmulps	%ymm0 , %ymm4 , %ymm4
+	vmulps	%ymm0 , %ymm6 , %ymm6
+	vmulps	%ymm0 , %ymm8 , %ymm8
+	vmulps	%ymm0 , %ymm10, %ymm10
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %ymm4,%ymm4
+	vaddps 	        (CO1, LDC), %ymm6,%ymm6
+	vaddps 	        (CO2), %ymm8,%ymm8
+	vaddps 	        (CO2, LDC), %ymm10,%ymm10
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm6 ,  	(CO1, LDC)
+	vmovups	%ymm8 ,  	(CO2)
+	vmovups	%ymm10,  	(CO2, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL4x4_SUB
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %xmm0	/* one k-step: 16-byte A vector at element index %rax */
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %xmm2
+	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231PS_(  	%xmm4,%xmm2,%xmm0 )	/* xmm4, xmm6, xmm8, xmm10 each += xmm0 * one broadcast B scalar */
+	VFMADD231PS_(  	%xmm6,%xmm3,%xmm0 )
+	vbroadcastss	 -2 * SIZE(BO, BI, SIZE), %xmm2
+	vbroadcastss	 -1 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231PS_(  	%xmm8,%xmm2,%xmm0  )
+	VFMADD231PS_(  	%xmm10,%xmm3,%xmm0 )
+	addq	$ 4 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 4 , %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE4x4
+	/* Scale xmm4, xmm6, xmm8, xmm10 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 16 bytes from each. */
+	vbroadcastss	ALPHA, %xmm0
+	/* Targets: (CO1), (CO1,LDC), (CO2), (CO2,LDC); CO2 is set by the invoking code. */
+	vmulps	%xmm0 , %xmm4 , %xmm4
+	vmulps	%xmm0 , %xmm6 , %xmm6
+	vmulps	%xmm0 , %xmm8 , %xmm8
+	vmulps	%xmm0 , %xmm10, %xmm10
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %xmm4,%xmm4
+	vaddps 	        (CO1, LDC), %xmm6,%xmm6
+	vaddps 	        (CO2), %xmm8,%xmm8
+	vaddps 	        (CO2, LDC), %xmm10,%xmm10
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+	vmovups	%xmm6 ,  	(CO1, LDC)
+	vmovups	%xmm8 ,  	(CO2)
+	vmovups	%xmm10,  	(CO2, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL2x4_SUB
+	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0	/* one k-step with scalar FMAs: two A scalars at element indices %rax and %rax+1 */
+	vmovss 	-15 * SIZE(AO, %rax, SIZE), %xmm1
+	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )	/* pairs (xmm4,xmm5) .. (xmm10,xmm11) each accumulate the products for one B scalar */
+	VFMADD231SS_(  	%xmm5,%xmm2,%xmm1 )
+	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
+	VFMADD231SS_(  	%xmm7,%xmm3,%xmm1 )
+	vmovss	 -2 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss	 -1 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_(  	%xmm8,%xmm2,%xmm0  )
+	VFMADD231SS_(  	%xmm9,%xmm2,%xmm1  )
+	VFMADD231SS_(  	%xmm10,%xmm3,%xmm0 )
+	VFMADD231SS_(  	%xmm11,%xmm3,%xmm1 )
+	addq	$ 4 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 2, %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE2x4
+	/* Scalar variant: scale xmm4..xmm11 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store one scalar from each. */
+	vmovss	ALPHA, %xmm0
+	/* Each register pair goes to offsets 0 and 1*SIZE of (CO1), (CO1,LDC), (CO2), (CO2,LDC); CO2 is set by the invoking code. */
+	vmulss	%xmm0 , %xmm4 , %xmm4
+	vmulss	%xmm0 , %xmm5 , %xmm5
+	vmulss	%xmm0 , %xmm6 , %xmm6
+	vmulss	%xmm0 , %xmm7 , %xmm7
+	vmulss	%xmm0 , %xmm8 , %xmm8
+	vmulss	%xmm0 , %xmm9 , %xmm9
+	vmulss	%xmm0 , %xmm10, %xmm10
+	vmulss	%xmm0 , %xmm11, %xmm11
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddss 	        (CO1), %xmm4,%xmm4
+	vaddss  1 * SIZE(CO1), %xmm5,%xmm5
+
+	vaddss 	        (CO1, LDC), %xmm6,%xmm6
+	vaddss  1 * SIZE(CO1, LDC), %xmm7,%xmm7
+
+	vaddss 	        (CO2), %xmm8,%xmm8
+	vaddss  1 * SIZE(CO2), %xmm9,%xmm9
+
+	vaddss 	        (CO2, LDC), %xmm10,%xmm10
+	vaddss  1 * SIZE(CO2, LDC), %xmm11,%xmm11
+
+#endif
+
+	vmovss	%xmm4 ,  	(CO1)
+	vmovss	%xmm5 , 1 * SIZE(CO1)
+
+	vmovss	%xmm6 ,  	(CO1, LDC)
+	vmovss	%xmm7 , 1 * SIZE(CO1, LDC)
+
+	vmovss	%xmm8 ,  	(CO2)
+	vmovss	%xmm9 , 1 * SIZE(CO2)
+
+	vmovss	%xmm10,  	(CO2, LDC)
+	vmovss	%xmm11, 1 * SIZE(CO2, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL1x4_SUB
+	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0	/* one k-step with scalar FMAs: one A scalar at element index %rax */
+	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )	/* xmm4, xmm6, xmm8, xmm10 each += xmm0 * one B scalar */
+	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
+	vmovss	 -2 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss	 -1 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_(  	%xmm8,%xmm2,%xmm0  )
+	VFMADD231SS_(  	%xmm10,%xmm3,%xmm0 )
+	addq	$ 4 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 1, %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE1x4
+	/* Scalar variant: scale xmm4, xmm6, xmm8, xmm10 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store one scalar from each. */
+	vmovss	ALPHA, %xmm0
+	/* Targets: (CO1), (CO1,LDC), (CO2), (CO2,LDC); CO2 is set by the invoking code. */
+	vmulss	%xmm0 , %xmm4 , %xmm4
+	vmulss	%xmm0 , %xmm6 , %xmm6
+	vmulss	%xmm0 , %xmm8 , %xmm8
+	vmulss	%xmm0 , %xmm10, %xmm10
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddss 	        (CO1), %xmm4,%xmm4
+	vaddss 	        (CO1, LDC), %xmm6,%xmm6
+	vaddss 	        (CO2), %xmm8,%xmm8
+	vaddss 	        (CO2, LDC), %xmm10,%xmm10
+
+#endif
+
+	vmovss	%xmm4 ,  	(CO1)
+	vmovss	%xmm6 ,  	(CO1, LDC)
+	vmovss	%xmm8 ,  	(CO2)
+	vmovss	%xmm10,  	(CO2, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+/*******************************************************************************************
+* 2 lines of N
+*******************************************************************************************/
+
+.macro KERNEL16x2_SUB
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %zmm0	/* one k-step: 64-byte A vector at element index %rax */
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %zmm2
+	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PS_(  	%zmm4,%zmm2,%zmm0 )	/* zmm4 and zmm6 each += zmm0 * one broadcast B scalar */
+	VFMADD231PS_(  	%zmm6,%zmm3,%zmm0 )
+	addq	$ 2 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 16, %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE16x2
+	/* Scale zmm4 and zmm6 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 64 bytes each to (CO1) and (CO1,LDC). */
+	vbroadcastss	ALPHA, %zmm0
+
+	vmulps	%zmm0 , %zmm4 , %zmm4
+	vmulps	%zmm0 , %zmm6 , %zmm6
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %zmm4,%zmm4
+
+	vaddps 	        (CO1, LDC), %zmm6,%zmm6
+
+#endif
+
+	vmovups	%zmm4 ,  	(CO1)
+
+	vmovups	%zmm6 ,  	(CO1, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL8x2_SUB
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0	/* one k-step: 32-byte A vector at element index %rax */
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2
+	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )	/* ymm4 and ymm6 each += ymm0 * one broadcast B scalar */
+	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )
+	addq	$ 2 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 8 , %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE8x2
+	/* Scale ymm4 and ymm6 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 32 bytes each to (CO1) and (CO1,LDC). */
+	vbroadcastss	ALPHA, %ymm0
+
+	vmulps	%ymm0 , %ymm4 , %ymm4
+	vmulps	%ymm0 , %ymm6 , %ymm6
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %ymm4,%ymm4
+	vaddps 	        (CO1, LDC), %ymm6,%ymm6
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm6 ,  	(CO1, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL4x2_SUB
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %xmm0	/* one k-step: 16-byte A vector at element index %rax */
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %xmm2
+	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231PS_(  	%xmm4,%xmm2,%xmm0 )	/* xmm4 and xmm6 each += xmm0 * one broadcast B scalar */
+	VFMADD231PS_(  	%xmm6,%xmm3,%xmm0 )
+	addq	$ 2 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 4 , %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE4x2
+	/* Scale xmm4 and xmm6 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 16 bytes each to (CO1) and (CO1,LDC). */
+	vbroadcastss	ALPHA, %xmm0
+
+	vmulps	%xmm0 , %xmm4 , %xmm4
+	vmulps	%xmm0 , %xmm6 , %xmm6
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %xmm4,%xmm4
+	vaddps 	        (CO1, LDC), %xmm6,%xmm6
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+	vmovups	%xmm6 ,  	(CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL2x2_SUB
+	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0	/* one k-step with scalar FMAs: two A scalars at element indices %rax and %rax+1 */
+	vmovss 	-15 * SIZE(AO, %rax, SIZE), %xmm1
+	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )	/* pairs (xmm4,xmm5) and (xmm6,xmm7) each accumulate the products for one B scalar */
+	VFMADD231SS_(  	%xmm5,%xmm2,%xmm1 )
+	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
+	VFMADD231SS_(  	%xmm7,%xmm3,%xmm1 )
+	addq	$ 2 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 2, %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE2x2
+	/* Scalar variant: scale xmm4..xmm7 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store one scalar from each. */
+	vmovss	ALPHA, %xmm0
+	/* (xmm4,xmm5) go to offsets 0 and 1*SIZE of (CO1); (xmm6,xmm7) go to offsets 0 and 1*SIZE of (CO1,LDC). */
+	vmulss	%xmm0 , %xmm4 , %xmm4
+	vmulss	%xmm0 , %xmm5 , %xmm5
+	vmulss	%xmm0 , %xmm6 , %xmm6
+	vmulss	%xmm0 , %xmm7 , %xmm7
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddss 	        (CO1), %xmm4,%xmm4
+	vaddss  1 * SIZE(CO1), %xmm5,%xmm5
+
+	vaddss 	        (CO1, LDC), %xmm6,%xmm6
+	vaddss  1 * SIZE(CO1, LDC), %xmm7,%xmm7
+
+#endif
+
+	vmovss	%xmm4 ,  	(CO1)
+	vmovss	%xmm5 , 1 * SIZE(CO1)
+
+	vmovss	%xmm6 ,  	(CO1, LDC)
+	vmovss	%xmm7 , 1 * SIZE(CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL1x2_SUB
+	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0	/* one k-step with scalar FMAs: one A scalar at element index %rax */
+	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )	/* xmm4 and xmm6 each += xmm0 * one B scalar */
+	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
+	addq	$ 2 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 1, %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE1x2
+	/* Scalar variant: scale xmm4 and xmm6 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store one scalar each to (CO1) and (CO1,LDC). */
+	vmovss	ALPHA, %xmm0
+
+	vmulss	%xmm0 , %xmm4 , %xmm4
+	vmulss	%xmm0 , %xmm6 , %xmm6
+
+#if !defined(TRMMKERNEL)
+
+	vaddss 	        (CO1), %xmm4,%xmm4
+	vaddss 	        (CO1, LDC), %xmm6,%xmm6
+
+#endif
+
+	vmovss	%xmm4 ,  	(CO1)
+	vmovss	%xmm6 ,  	(CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+/*******************************************************************************************
+* 1 line of N
+*******************************************************************************************/
+
+.macro KERNEL16x1_SUB
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %zmm0	/* one k-step: 64-byte A vector at element index %rax */
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %zmm2
+	VFMADD231PS_(  	%zmm4,%zmm2,%zmm0 )	/* zmm4 += zmm0 * the broadcast B scalar */
+	addq	$ 1 , BI	/* BI and %rax are element indices, scaled by SIZE in the addresses above */
+	addq	$ 16, %rax /* flags of this add are left for the invoking code (presumably its loop test - confirm) */
+.endm
+
+.macro SAVE16x1
+	/* Scale zmm4 by ALPHA; unless TRMMKERNEL is defined, add the values already in C; then store 64 bytes to (CO1). */
+	vbroadcastss	ALPHA, %zmm0
+
+	vmulps	%zmm0 , %zmm4 , %zmm4
+
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %zmm4,%zmm4
+
+#endif
+
+	vmovups	%zmm4 ,  	(CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL8x1_SUB	// one k step of an 8x1 tile, accumulated in ymm4
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0	// ymm0 = one full ymm of packed A values at index rax (unaligned load)
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2	// ymm2 = one scalar of B copied into every lane
+	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )	// presumably ymm4 += ymm2 * ymm0 (macro defined outside this view - confirm)
+	addq	$ 1 , BI		// B index advances by 1 value
+	addq	$ 8 , %rax 	// A index advances by 8 values; last flag-setting instruction (call sites presumably branch on it)
+.endm
+
+.macro SAVE8x1
+	// Scale the 8x1 accumulator ymm4 by ALPHA and write it to C at CO1.
+	vbroadcastss	ALPHA, %ymm0
+
+	vmulps	%ymm0 , %ymm4 , %ymm4
+	// Non-TRMM build only: add the values currently stored in C.
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %ymm4,%ymm4
+
+#endif
+	// Unaligned store of the whole ymm register.
+	vmovups	%ymm4 ,  	(CO1)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL4x1_SUB	// one k step of a 4x1 tile, accumulated in xmm4
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %xmm0	// xmm0 = one full xmm of packed A values at index rax (unaligned load)
+	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %xmm2	// xmm2 = one scalar of B copied into every lane
+	VFMADD231PS_(  	%xmm4,%xmm2,%xmm0 )	// presumably xmm4 += xmm2 * xmm0 (macro defined outside this view - confirm)
+	addq	$ 1 , BI		// B index advances by 1 value
+	addq	$ 4 , %rax 	// A index advances by 4 values; last flag-setting instruction (call sites presumably branch on it)
+.endm
+
+.macro SAVE4x1
+	// Scale the 4x1 accumulator xmm4 by ALPHA and write it to C at CO1.
+	vbroadcastss	ALPHA, %xmm0
+
+	vmulps	%xmm0 , %xmm4 , %xmm4
+	// Non-TRMM build only: add the values currently stored in C.
+#if !defined(TRMMKERNEL)
+
+	vaddps 	        (CO1), %xmm4,%xmm4
+
+#endif
+	// Unaligned store of the whole xmm register.
+	vmovups	%xmm4 ,  	(CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL2x1_SUB	// one k step of a 2x1 tile, accumulated in xmm4 and xmm5
+	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0	// xmm0 = first scalar of A at index rax
+	vmovss 	-15 * SIZE(AO, %rax, SIZE), %xmm1	// xmm1 = the adjacent scalar of A
+	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2	// xmm2 = one scalar of B at index BI
+	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )	// presumably xmm4 += xmm2 * xmm0 (macro defined outside this view - confirm)
+	VFMADD231SS_(  	%xmm5,%xmm2,%xmm1 )	// presumably xmm5 += xmm2 * xmm1
+	addq	$ 1 , BI		// B index advances by 1 value
+	addq	$ 2 , %rax 	// A index advances by 2 values; last flag-setting instruction (call sites presumably branch on it)
+.endm
+
+.macro SAVE2x1
+	// Scale the 2x1 accumulators xmm4 and xmm5 by ALPHA and write them to C.
+	vmovss	ALPHA, %xmm0
+
+	vmulss	%xmm0 , %xmm4 , %xmm4
+	vmulss	%xmm0 , %xmm5 , %xmm5
+	// Non-TRMM build only: add the values currently stored in C.
+#if !defined(TRMMKERNEL)
+
+	vaddss 	        (CO1), %xmm4,%xmm4
+	vaddss  1 * SIZE(CO1), %xmm5,%xmm5
+
+#endif
+	// xmm4 is stored at CO1, xmm5 one element after it.
+	vmovss	%xmm4 ,  	(CO1)
+	vmovss	%xmm5 , 1 * SIZE(CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL1x1_SUB	// one k step of a 1x1 tile, accumulated in xmm4
+	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0	// xmm0 = one scalar of A at index rax
+	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2	// xmm2 = one scalar of B at index BI
+	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )	// presumably xmm4 += xmm2 * xmm0 (macro defined outside this view - confirm)
+	addq	$ 1 , BI		// B index advances by 1 value
+	addq	$ 1 , %rax 	// A index advances by 1 value; last flag-setting instruction (call sites presumably branch on it)
+.endm
+
+.macro SAVE1x1
+	// Scale the single accumulator xmm4 by ALPHA and write it to C at CO1.
+	vmovss	ALPHA, %xmm0
+
+	vmulss	%xmm0 , %xmm4 , %xmm4
+	// Non-TRMM build only: add the value currently stored in C.
+#if !defined(TRMMKERNEL)
+
+	vaddss 	        (CO1), %xmm4,%xmm4
+
+#endif
+	// Store the scalar result.
+	vmovss	%xmm4 ,  	(CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+
+#if !defined(TRMMKERNEL)
+
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+
+
+	PROLOGUE
+	PROFCODE
+	// GEMM entry: save registers, pick up the arguments, set up the aligned work area, then split N by 12.
+	subq	$STACKSIZE, %rsp
+	movq	%rbx,   (%rsp)
+	movq	%rbp,  8(%rsp)
+	movq	%r12, 16(%rsp)
+	movq	%r13, 24(%rsp)
+	movq	%r14, 32(%rsp)
+	movq	%r15, 40(%rsp)
+
+	vzeroupper
+
+#ifdef WINDOWS_ABI
+	movq	%rdi,    48(%rsp)
+	movq	%rsi,    56(%rsp)
+	movups	%xmm6,   64(%rsp)
+	movups	%xmm7,   80(%rsp)
+	movups	%xmm8,   96(%rsp)
+	movups	%xmm9,  112(%rsp)
+	movups	%xmm10, 128(%rsp)
+	movups	%xmm11, 144(%rsp)
+	movups	%xmm12, 160(%rsp)
+	movups	%xmm13, 176(%rsp)
+	movups	%xmm14, 192(%rsp)
+	movups	%xmm15, 208(%rsp)
+	// Windows ABI: rdi, rsi and xmm6-xmm15 were saved above; now copy the arguments into the names used below.
+	movq	ARG1,      OLD_M
+	movq	ARG2,      OLD_N
+	movq	ARG3,      OLD_K
+	movq	OLD_A,     A
+	movq	OLD_B,     B
+	movq	OLD_C,     C
+	movq	OLD_LDC,   LDC
+#ifdef TRMMKERNEL	// NOTE(review): enclosed by the #if !defined(TRMMKERNEL) opened before PROLOGUE, so this branch looks unreachable here - confirm
+	vmovsd	OLD_OFFSET, %xmm12
+#endif
+	vmovaps	%xmm3, %xmm0	// copy xmm3 into xmm0, which is stored to ALPHA further down
+
+#else
+	movq	STACKSIZE +  8(%rsp), LDC
+#ifdef TRMMKERNEL	// NOTE(review): same apparently unreachable TRMM branch as in the Windows path - confirm
+	movsd	STACKSIZE + 16(%rsp), %xmm12
+#endif
+
+#endif
+	// Remember the current rsp in SP, reserve 128 + L_BUFFER_SIZE bytes and round rsp down to a 4096-byte boundary.
+	movq    %rsp, SP      # save old stack
+        subq    $128 + L_BUFFER_SIZE, %rsp
+        andq    $-4096, %rsp    # align stack
+
+        STACK_TOUCH
+	// Nothing to compute when M, N or K is zero: leave through .L999.
+	cmpq	$0, OLD_M
+	je	.L999
+
+	cmpq	$0, OLD_N
+	je	.L999
+
+	cmpq	$0, OLD_K
+	je	.L999
+
+	movq	OLD_M, M
+	movq	OLD_N, N
+	movq	OLD_K, K
+
+	vmovss	 %xmm0, ALPHA
+	// LDC is shifted left by BASE_SHIFT (defined outside this view; presumably converts an element count to bytes - confirm).
+	salq	$BASE_SHIFT, LDC
+	// Unsigned divide of N by 12; despite their names, Ndiv6 receives N / 12 and Nmod6 receives N % 12.
+	movq    N, %rax
+        xorq    %rdx, %rdx
+        movq    $12,  %rdi
+        divq    %rdi                    //    N / 12
+        movq    %rax, Ndiv6             //    N / 12
+        movq    %rdx, Nmod6             //    N % 12
+	// J = N / 12 outer passes; when it is zero go straight to .L4_00.
+	movq	Ndiv6,  J
+	cmpq	$0, J
+	je	.L4_00
+	ALIGN_4
+
+
+/*******************************************************************************************/
+
+.L6_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	salq	$2, %rax		// 4 values of B
+        leaq    (B, %rax,4), BO2
+        movq    BO2, B                  // next offset of B
+        movq    K, %rax
+
+	ALIGN_4
+
+
+.L6_02c:
+
+	vmovups	(BO1), %xmm0
+	vmovsd	(BO2), %xmm1
+	vmovups	%xmm0, (BO)
+	vmovsd	%xmm1, 4*SIZE(BO)
+	addq	$ 4*SIZE,BO1
+	addq	$ 4*SIZE,BO2
+	addq	$ 6*SIZE,BO
+	decq	%rax
+	jnz	.L6_02c
+
+
+.L6_10:
+	movq	 C, CO1
+	leaq	(C,   LDC, 2), CO2	
+	leaq	(CO2, LDC, 1), CO2		// co2 = c + 3 * ldc
+	leaq	(C,   LDC, 4), C	
+	leaq	(C,   LDC, 2), C		// c = c + 6 * ldc
+
+	movq	A, AO		 	// aoffset = a
+	addq	$ 16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L6_20
+
+	ALIGN_4
+
+.L6_11:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L6_16
+
+	ALIGN_4
+
+.L6_12:
+
+	KERNEL16x6_SUB4
+
+	KERNEL16x6_SUB4
+
+	je	.L6_16
+
+	KERNEL16x6_SUB4
+
+	KERNEL16x6_SUB4
+
+	je	.L6_16
+
+	jmp	.L6_12
+	ALIGN_4
+
+.L6_16:
+        movq    K, %rax
+	// Remainder of the 16x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L6_19
+
+	ALIGN_4
+
+.L6_17:
+
+	KERNEL16x6_SUB
+	// Repeats on the flags left by KERNEL16x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L6_17
+	ALIGN_4
+
+
+.L6_19:
+
+	SAVE16x6
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	addq	$16 * SIZE, CO2		# coffset += 16
+	decq	I			# i --
+	jg	.L6_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L6_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L6_60		// to next 6 lines of N
+
+	testq	$8, M		
+	jz	.L6_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L6_20_1:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L6_20_6
+
+	ALIGN_4
+
+.L6_20_2:
+
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+
+	je	.L6_20_6
+
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+
+	je	.L6_20_6
+
+	jmp	.L6_20_2
+	ALIGN_4
+
+.L6_20_6:
+        movq    K, %rax
+	// Remainder of the 8x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L6_20_9
+
+	ALIGN_4
+
+.L6_20_7:
+
+	KERNEL8x6_SUB
+	// Repeats on the flags left by KERNEL8x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L6_20_7
+	ALIGN_4
+
+
+.L6_20_9:
+
+	SAVE8x6
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	addq	$8 * SIZE, CO2		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L6_21pre:
+
+	testq	$4, M		
+	jz	.L6_30
+	ALIGN_4
+
+.L6_21:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L6_26
+
+	ALIGN_4
+
+.L6_22:
+
+	prefetcht0	A_PR1(AO)
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+
+	prefetcht0	A_PR1(AO)
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+
+	je	.L6_26
+
+	prefetcht0	A_PR1(AO)
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+
+	prefetcht0	A_PR1(AO)
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+
+	je	.L6_26
+
+	jmp	.L6_22
+	ALIGN_4
+
+.L6_26:
+        movq    K, %rax
+	// Remainder of the 4x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L6_29
+
+	ALIGN_4
+
+.L6_27:
+
+	KERNEL4x6_SUB
+	// Repeats on the flags left by KERNEL4x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L6_27
+	ALIGN_4
+
+
+.L6_29:
+
+	SAVE4x6
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	addq	$4 * SIZE, CO2		# coffset += 4
+	ALIGN_4
+	
+
+.L6_30:
+	testq	$2, M		
+	jz	.L6_40
+
+	ALIGN_4
+
+.L6_31:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L6_36
+
+	ALIGN_4
+
+.L6_32:
+
+	prefetcht0	A_PR1(AO)
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+
+	je	.L6_36
+
+	prefetcht0	A_PR1(AO)
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+
+	je	.L6_36
+
+	jmp	.L6_32
+	ALIGN_4
+
+.L6_36:
+        movq    K, %rax
+	// Remainder of the 2x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L6_39
+
+	ALIGN_4
+
+.L6_37:
+
+	KERNEL2x6_SUB
+	// Repeats on the flags left by KERNEL2x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L6_37
+	ALIGN_4
+
+
+.L6_39:
+
+	SAVE2x6
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	addq	$2 * SIZE, CO2		# coffset += 2
+	ALIGN_4
+
+.L6_40:
+	testq	$1, M		// bit 0 of M: is a final 1-wide tile needed?
+	jz	.L6_60		// no: on to the next 6 lines of N
+
+	ALIGN_4
+
+.L6_41:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L6_46
+
+	ALIGN_4
+
+.L6_42:
+
+	prefetcht0	A_PR1(AO)
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+
+	je	.L6_46
+
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+
+	je	.L6_46
+
+	jmp	.L6_42
+	ALIGN_4
+
+.L6_46:
+        movq    K, %rax
+	// Remainder of the 1x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L6_49
+
+	ALIGN_4
+
+.L6_47:
+
+	KERNEL1x6_SUB
+	// Repeats on the flags left by KERNEL1x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L6_47
+	ALIGN_4
+
+
+.L6_49:
+
+	SAVE1x6
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	addq	$1 * SIZE, CO2		# coffset += 1
+	ALIGN_4
+	
+
+
+
+	
+.L6_60:
+
+
+/*******************************************************************************************/
+
+
+.L7_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	salq	$2, %rax		// 4 values of B
+        leaq    (B, %rax,4), BO2
+        movq    K, %rax
+
+	ALIGN_4
+
+
+.L7_02c:
+
+	vmovsd	2*SIZE(BO1), %xmm0
+	vmovups	      (BO2), %xmm1
+	vmovsd	%xmm0, (BO)
+	vmovups	%xmm1, 2*SIZE(BO)
+	addq	$ 4*SIZE,BO1
+	addq	$ 4*SIZE,BO2
+	addq	$ 6*SIZE,BO
+	decq	%rax
+	jnz	.L7_02c
+
+        movq    BO2, B                  // next offset of B
+
+.L7_10:
+	movq	 C, CO1
+	leaq	(C,   LDC, 2), CO2	
+	leaq	(CO2, LDC, 1), CO2		// co2 = c + 3 * ldc
+	leaq	(C,   LDC, 4), C	
+	leaq	(C,   LDC, 2), C		// c = c + 6 * ldc
+
+	movq	A, AO		 	// aoffset = a
+	addq	$ 16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L7_20
+
+	ALIGN_4
+
+.L7_11:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L7_16
+
+	ALIGN_4
+
+.L7_12:
+
+	KERNEL16x6_SUB4
+
+	KERNEL16x6_SUB4
+
+	je	.L7_16
+
+	KERNEL16x6_SUB4
+
+	KERNEL16x6_SUB4
+
+	je	.L7_16
+
+	jmp	.L7_12
+	ALIGN_4
+
+.L7_16:
+        movq    K, %rax
+	// Remainder of the 16x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L7_19
+
+	ALIGN_4
+
+.L7_17:
+
+	KERNEL16x6_SUB
+	// Repeats on the flags left by KERNEL16x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L7_17
+	ALIGN_4
+
+
+.L7_19:
+
+	SAVE16x6
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	addq	$16 * SIZE, CO2		# coffset += 16
+	decq	I			# i --
+	jg	.L7_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L7_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L7_60		// to next 6 lines of N
+
+	testq	$8, M		
+	jz	.L7_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L7_20_1:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L7_20_6
+
+	ALIGN_4
+
+.L7_20_2:
+
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+
+	je	.L7_20_6
+
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+	prefetcht0	A_PR1(AO)
+	KERNEL8x6_SUB
+	KERNEL8x6_SUB
+
+	je	.L7_20_6
+
+	jmp	.L7_20_2
+	ALIGN_4
+
+.L7_20_6:
+        movq    K, %rax
+	// Remainder of the 8x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L7_20_9
+
+	ALIGN_4
+
+.L7_20_7:
+
+	KERNEL8x6_SUB
+	// Repeats on the flags left by KERNEL8x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L7_20_7
+	ALIGN_4
+
+
+.L7_20_9:
+
+	SAVE8x6
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	addq	$8 * SIZE, CO2		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L7_21pre:
+
+	testq	$4, M		
+	jz	.L7_30
+	ALIGN_4
+
+.L7_21:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L7_26
+
+	ALIGN_4
+
+.L7_22:
+
+	prefetcht0	A_PR1(AO)
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+
+	prefetcht0	A_PR1(AO)
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+
+	je	.L7_26
+
+	prefetcht0	A_PR1(AO)
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+
+	prefetcht0	A_PR1(AO)
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+	KERNEL4x6_SUB
+
+	je	.L7_26
+
+	jmp	.L7_22
+	ALIGN_4
+
+.L7_26:
+        movq    K, %rax
+	// Remainder of the 4x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L7_29
+
+	ALIGN_4
+
+.L7_27:
+
+	KERNEL4x6_SUB
+	// Repeats on the flags left by KERNEL4x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L7_27
+	ALIGN_4
+
+
+.L7_29:
+
+	SAVE4x6
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	addq	$4 * SIZE, CO2		# coffset += 4
+	ALIGN_4
+	
+
+.L7_30:
+	testq	$2, M		
+	jz	.L7_40
+
+	ALIGN_4
+
+.L7_31:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L7_36
+
+	ALIGN_4
+
+.L7_32:
+
+	prefetcht0	A_PR1(AO)
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+
+	je	.L7_36
+
+	prefetcht0	A_PR1(AO)
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+	KERNEL2x6_SUB
+
+	je	.L7_36
+
+	jmp	.L7_32
+	ALIGN_4
+
+.L7_36:
+        movq    K, %rax
+	// Remainder of the 2x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L7_39
+
+	ALIGN_4
+
+.L7_37:
+
+	KERNEL2x6_SUB
+	// Repeats on the flags left by KERNEL2x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L7_37
+	ALIGN_4
+
+
+.L7_39:
+
+	SAVE2x6
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	addq	$2 * SIZE, CO2		# coffset += 2
+	ALIGN_4
+
+.L7_40:
+	testq	$1, M		// bit 0 of M: is a final 1-wide tile needed?
+	jz	.L7_60		// no: this 6-line panel of N is finished
+
+	ALIGN_4
+
+.L7_41:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L7_46
+
+	ALIGN_4
+
+.L7_42:
+
+	prefetcht0	A_PR1(AO)
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+
+	je	.L7_46
+
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+	KERNEL1x6_SUB
+
+	je	.L7_46
+
+	jmp	.L7_42
+	ALIGN_4
+
+.L7_46:
+        movq    K, %rax
+	// Remainder of the 1x6 tile: the k steps not covered by the loop above, which handles k rounded down to a multiple of 8.
+	andq	$7, %rax		# rax = k & 7, i.e. k % 8 leftover steps
+	je .L7_49
+
+	ALIGN_4
+
+.L7_47:
+
+	KERNEL1x6_SUB
+	// Repeats on the flags left by KERNEL1x6_SUB (defined outside this view; presumably it counts rax down - confirm).
+	jnz	.L7_47
+	ALIGN_4
+
+
+.L7_49:
+
+	SAVE1x6
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	addq	$1 * SIZE, CO2		# coffset += 1
+	ALIGN_4
+	
+
+
+
+	
+.L7_60:
+
+	decq	J			// j --
+	jg	.L6_01			// next 12 lines of N
+
+
+
+
+/*******************************************************************************************/
+.L4_00:
+
+ 	movq    Nmod6,  J
+        sarq    $2, J           // j = j / 4
+        cmpq    $ 0, J
+        je      .L2_00
+        ALIGN_4
+
+
+.L4_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	sarq	$2, %rax		// K / 4
+	jz	.L4_01b
+	ALIGN_4
+
+
+.L4_01a:
+        prefetcht0 512(BO1)
+        prefetchw  512(BO)
+
+	vmovups	       (BO1), %xmm0
+	vmovups	 4*SIZE(BO1), %xmm1
+	vmovups	 8*SIZE(BO1), %xmm2
+	vmovups	12*SIZE(BO1), %xmm3
+
+	vmovups	%xmm0,       (BO)
+	vmovups	%xmm1, 4*SIZE(BO)
+	vmovups	%xmm2, 8*SIZE(BO)
+	vmovups	%xmm3,12*SIZE(BO)
+
+	addq	$ 16*SIZE,BO1
+	addq	$ 16*SIZE,BO
+	decq	%rax
+	jnz	.L4_01a
+
+
+.L4_01b:
+
+        movq    K, %rax
+        andq    $3, %rax                // K % 4
+        jz      .L4_02d
+        ALIGN_4
+
+.L4_02c:
+
+	vmovups	(BO1), %xmm0
+	vmovups	%xmm0, (BO)
+	addq	$ 4*SIZE,BO1
+	addq	$ 4*SIZE,BO
+	decq	%rax
+	jnz	.L4_02c
+
+.L4_02d:
+
+	movq	BO1, B			// next offset of B
+
+.L4_10:
+	movq	 C, CO1
+	leaq	(C, LDC, 2), CO2	
+	leaq	(C, LDC, 4), C		// c += 4 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$ 16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L4_20
+
+	ALIGN_4
+
+.L4_11:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             	// first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $16, %rax	// number of values in AO
+#else
+        addq    $4, %rax	// number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L4_16
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4) , BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_12:
+
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	prefetcht0	B_PR1(BO, BI  , SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	prefetcht0	B_PR1(BO, BI  , SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+
+	je	.L4_16
+
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	prefetcht0	B_PR1(BO, BI  , SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	prefetcht0	B_PR1(BO, BI  , SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+
+	je	.L4_16
+
+	jmp	.L4_12
+	ALIGN_4
+
+.L4_16:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+	// Remainder of the 16x4 tile: the steps of the count in rax (K, or KKK in a TRMM build) left over by the multiple-of-8 loop above.
+	andq	$7, %rax		# rax = count & 7, i.e. count % 8 leftover steps
+	je .L4_19
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+	// AO and BO are moved past the remainder data and both indices are negated, so the loop indexes upward toward zero.
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_17:
+
+	KERNEL16x4_SUB
+	// Repeats while the flags left by KERNEL16x4_SUB are negative (macro defined outside this view; presumably its final add to rax - confirm).
+	jl	.L4_17
+	ALIGN_4
+
+
+.L4_19:
+
+	SAVE16x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK				
+#endif
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	addq	$16 * SIZE, CO2		# coffset += 16
+	decq	I			# i --
+	jg	.L4_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L4_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L4_60		// M is a multiple of 16: to next 4 lines of N
+
+	testq	$8, M		// bit 3 of M: is an 8-wide tile needed?
+	jz	.L4_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L4_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L4_20_6
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_20_2:
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	je	.L4_20_6
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	je	.L4_20_6
+
+	jmp	.L4_20_2
+	ALIGN_4
+
+.L4_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+	// Remainder of the 8x4 tile: the steps of the count in rax (K, or KKK in a TRMM build) left over by the multiple-of-8 loop above.
+	andq	$7, %rax		# rax = count & 7, i.e. count % 8 leftover steps
+	je .L4_20_9
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+	// AO and BO are moved past the remainder data and both indices are negated, so the loop indexes upward toward zero.
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_20_7:
+
+	KERNEL8x4_SUB
+	// Repeats while the flags left by KERNEL8x4_SUB are negative (macro defined outside this view; presumably its final add to rax - confirm).
+	jl	.L4_20_7
+	ALIGN_4
+
+
+.L4_20_9:
+
+	SAVE8x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	addq	$8 * SIZE, CO2		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L4_21pre:
+
+	testq	$4, M		
+	jz	.L4_30
+	ALIGN_4
+
+.L4_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L4_26
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_22:
+
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+
+	je	.L4_26
+
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+
+	je	.L4_26
+
+	jmp	.L4_22
+	ALIGN_4
+
+.L4_26:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+	// Remainder of the 4x4 tile: the steps of the count in rax (K, or KKK in a TRMM build) left over by the multiple-of-8 loop above.
+	andq	$7, %rax		# rax = count & 7, i.e. count % 8 leftover steps
+	je .L4_29
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+	// AO and BO are moved past the remainder data and both indices are negated, so the loop indexes upward toward zero.
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_27:
+
+	KERNEL4x4_SUB
+	// Repeats while the flags left by KERNEL4x4_SUB are negative (macro defined outside this view; presumably its final add to rax - confirm).
+	jl	.L4_27
+	ALIGN_4
+
+
+.L4_29:
+
+	SAVE4x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK
+#endif
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	addq	$4 * SIZE, CO2		# coffset += 4
+	ALIGN_4
+	
+
+.L4_30:
+	testq	$2, M		
+	jz	.L4_40
+
+	ALIGN_4
+
+.L4_31:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // number of values in AO
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L4_36
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_32:
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	je	.L4_36
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	je	.L4_36
+
+	jmp	.L4_32
+	ALIGN_4
+
+.L4_36:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+	// Remainder of the 2x4 tile: the steps of the count in rax (K, or KKK in a TRMM build) left over by the multiple-of-8 loop above.
+	andq	$7, %rax		# rax = count & 7, i.e. count % 8 leftover steps
+	je .L4_39
+
+	movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+	// AO and BO are moved past the remainder data and both indices are negated, so the loop indexes upward toward zero.
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_37:
+
+	KERNEL2x4_SUB
+	// Repeats while the flags left by KERNEL2x4_SUB are negative (macro defined outside this view; presumably its final add to rax - confirm).
+	jl	.L4_37
+	ALIGN_4
+
+
+.L4_39:
+
+	SAVE2x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK
+#endif
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	addq	$2 * SIZE, CO2		# coffset += 2
+	ALIGN_4
+
+.L4_40:
+	testq	$1, M		
+	jz	.L4_60		// to next 4 lines of N
+
+	ALIGN_4
+
+.L4_41:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // number of values in AO
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax
+	je	.L4_46
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_42:
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	je	.L4_46
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	je	.L4_46
+
+	jmp	.L4_42
+	ALIGN_4
+
+.L4_46:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+	// Remainder of the 1x4 tile: the steps of the count in rax (K, or KKK in a TRMM build) left over by the multiple-of-8 loop above.
+	andq	$7, %rax		# rax = count & 7, i.e. count % 8 leftover steps
+	je .L4_49
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+	// AO and BO are moved past the remainder data and both indices are negated; rax needs no scaling for a 1-wide tile.
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_47:
+
+	KERNEL1x4_SUB
+	// Repeats while the flags left by KERNEL1x4_SUB are negative (macro defined outside this view; presumably its final add to rax - confirm).
+	jl	.L4_47
+	ALIGN_4
+
+
+.L4_49:
+
+	SAVE1x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	addq	$1 * SIZE, CO2		# coffset += 1
+	ALIGN_4
+	
+
+
+
+	
+.L4_60:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $4, KK
+#endif
+
+	decq	J			// j --
+	jg	.L4_01			// next 4 lines of N
+
+
+
+/*******************************************************************************************/
+.L2_00:
+
+	movq	Nmod6, J		
+	andq	$3, J			// J = Nmod6 & 3
+	je	.L999			// zero -> go straight to the epilogue
+
+	movq	Nmod6, J		
+	andq	$2, J			// J = Nmod6 & 2 ; tests bit 1 only (not j % 4)
+	je	.L1_0			// bit 1 clear -> skip the 2-column pass
+
+.L2_01:
+
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	sarq	$2, %rax		// K / 4
+	jz	.L2_01b
+	ALIGN_4
+
+.L2_01a:
+
+	vmovsd	      (BO1), %xmm0
+	vmovsd	2*SIZE(BO1), %xmm1
+	vmovsd	4*SIZE(BO1), %xmm2
+	vmovsd	6*SIZE(BO1), %xmm3
+
+	vmovsd	%xmm0,       (BO)
+	vmovsd	%xmm1, 2*SIZE(BO)
+	vmovsd	%xmm2, 4*SIZE(BO)
+	vmovsd	%xmm3, 6*SIZE(BO)
+
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO
+	decq	%rax
+	jnz	.L2_01a
+
+
+.L2_01b:
+
+        movq    K, %rax
+        andq    $3, %rax                // K % 4
+        jz      .L2_02d
+        ALIGN_4
+
+.L2_02c:
+
+	vmovsd 	(BO1), %xmm0
+	vmovsd 	%xmm0, (BO)
+	addq	$2*SIZE,BO1
+	addq	$2*SIZE,BO
+	decq	%rax
+	jnz	.L2_02c
+
+.L2_02d:
+
+	movq	BO1, B			// next offset of B
+
+.L2_10:
+	movq	C, CO1
+	leaq	(C, LDC, 2), C		// c += 2 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L2_20
+
+	ALIGN_4
+
+.L2_11:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $16, %rax	// number of values in AO
+#else
+        addq    $2, %rax	// number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L2_16
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_12:
+
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+
+	je	.L2_16
+
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+
+	je	.L2_16
+
+	jmp	.L2_12
+	ALIGN_4
+
+.L2_16:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L2_19			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_17:
+
+	KERNEL16x2_SUB
+
+	jl	.L2_17
+	ALIGN_4
+
+
+.L2_19:
+
+	SAVE16x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK				
+#endif
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L2_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L2_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L2_60		// to next 2 lines of N
+
+	testq	$8, M		
+	jz	.L2_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L2_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_20_6
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_2:
+
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	je	.L2_20_6
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	je	.L2_20_6
+
+	jmp	.L2_20_2
+	ALIGN_4
+
+.L2_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L2_20_9			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_7:
+
+	KERNEL8x2_SUB
+
+	jl	.L2_20_7
+	ALIGN_4
+
+
+.L2_20_9:
+
+	SAVE8x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+	testq	$4, M		
+	jz	.L2_30
+	ALIGN_4
+
+.L2_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax			//  rax = count - ( count % 8 )
+	je	.L2_26				//  fewer than 8 iterations -> tail handling only
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_22:
+
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je	.L2_26
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je	.L2_26
+
+	jmp	.L2_22
+	ALIGN_4
+
+.L2_26:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L2_29			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_27:
+
+	KERNEL4x2_SUB
+
+	jl	.L2_27
+	ALIGN_4
+
+
+.L2_29:
+
+	SAVE4x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK
+#endif
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L2_30:
+	testq	$2, M		
+	jz	.L2_40
+
+	ALIGN_4
+
+.L2_31:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // number of values in AO
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_36
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_32:
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	je	.L2_36
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	je	.L2_36
+
+	jmp	.L2_32
+	ALIGN_4
+
+.L2_36:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L2_39			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+	
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_37:
+
+	KERNEL2x2_SUB
+
+	jl	.L2_37
+	ALIGN_4
+
+
+.L2_39:
+
+	SAVE2x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK
+#endif
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L2_40:
+	testq	$1, M		
+	jz	.L2_60		// to next 2 lines of N
+
+	ALIGN_4
+
+.L2_41:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // number of values in AO
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax
+	je	.L2_46
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_42:
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	je	.L2_46
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	je	.L2_46
+
+	jmp	.L2_42
+	ALIGN_4
+
+.L2_46:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L2_49			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_47:
+
+	KERNEL1x2_SUB
+
+	jl	.L2_47
+	ALIGN_4
+
+
+.L2_49:
+
+	SAVE1x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+
+
+	
+.L2_60:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $2, KK
+#endif
+
+
+
+
+.L1_0:
+
+/************************************************************************************************
+* Loop for Nmod6 % 2 > 0
+*************************************************************************************************/
+
+	movq	Nmod6, J		
+	andq	$1, J			// j % 2
+	je	.L999
+	ALIGN_4
+
+.L1_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	ALIGN_4
+
+.L1_02b:
+
+	vmovss	(BO1), %xmm0
+	vmovss	%xmm0,       (BO)
+	addq	$1*SIZE,BO1
+	addq	$1*SIZE,BO
+	decq	%rax
+	jnz	.L1_02b
+
+.L1_02c:
+
+	movq	BO1, B			// next offset of B
+
+.L1_10:
+	movq	C, CO1
+	leaq	(C, LDC, 1), C		// c += 1 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L1_20
+
+	ALIGN_4
+
+.L1_11:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $16, %rax	// number of values in AO
+#else
+        addq    $1, %rax	// number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L1_16
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_12:
+
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+
+	je	.L1_16
+
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+
+	je	.L1_16
+
+	jmp	.L1_12
+	ALIGN_4
+
+.L1_16:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L1_19			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_17:
+
+	KERNEL16x1_SUB
+
+	jl	.L1_17
+	ALIGN_4
+
+
+.L1_19:
+
+	SAVE16x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK				
+#endif
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L1_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L1_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L999
+
+	testq	$8, M		
+	jz	.L1_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L1_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_20_6
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_20_2:
+
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+
+	je	.L1_20_6
+
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+
+	je	.L1_20_6
+
+	jmp	.L1_20_2
+	ALIGN_4
+
+.L1_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L1_20_9			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_20_7:
+
+	KERNEL8x1_SUB
+
+	jl	.L1_20_7
+	ALIGN_4
+
+
+.L1_20_9:
+
+	SAVE8x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L1_21pre:
+
+	testq	$4, M		
+	jz	.L1_30
+	ALIGN_4
+
+.L1_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_26
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_22:
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	je	.L1_26
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	je	.L1_26
+
+	jmp	.L1_22
+	ALIGN_4
+
+.L1_26:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L1_29			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_27:
+
+	KERNEL4x1_SUB
+
+	jl	.L1_27
+	ALIGN_4
+
+
+.L1_29:
+
+	SAVE4x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK
+#endif
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L1_30:
+	testq	$2, M		
+	jz	.L1_40
+
+	ALIGN_4
+
+.L1_31:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // number of values in AO
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_36
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_32:
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	je	.L1_36
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	je	.L1_36
+
+	jmp	.L1_32
+	ALIGN_4
+
+.L1_36:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L1_39			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+	
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_37:
+
+	KERNEL2x1_SUB
+
+	jl	.L1_37
+	ALIGN_4
+
+
+.L1_39:
+
+	SAVE2x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK
+#endif
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L1_40:
+	testq	$1, M		
+	jz	.L999
+
+	ALIGN_4
+
+.L1_41:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // number of values in AO
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax
+	je	.L1_46
+	movq    %rax, BI                        //  Index for BO
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_42:
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	je	.L1_46
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	je	.L1_46
+
+	jmp	.L1_42
+	ALIGN_4
+
+.L1_46:
+#ifndef TRMMKERNEL
+        movq    K, %rax                 // plain GEMM: rax = K
+#else
+        movq    KKK, %rax               // TRMM: rax = trip count stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax &= 7 ; depth left over past the multiple-of-8 part
+	je .L1_49			// nothing left over -> skip the one-at-a-time tail loop
+
+	movq    %rax, BI                        //  Index for BO
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_47:
+
+	KERNEL1x1_SUB
+
+	jl	.L1_47
+	ALIGN_4
+
+
+.L1_49:
+
+	SAVE1x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+.L999:
+	movq   		SP, %rsp		// common exit: return to the stack pointer kept in SP
+	movq	   (%rsp), %rbx		// reload the general-purpose registers from the save area
+	movq	  8(%rsp), %rbp
+	movq	 16(%rsp), %r12
+	movq	 24(%rsp), %r13
+	movq	 32(%rsp), %r14
+	movq	 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	movq	 48(%rsp), %rdi		// rdi, rsi and xmm6-xmm15 are callee-saved in the Windows x64 convention
+	movq	 56(%rsp), %rsi
+	movups	 64(%rsp), %xmm6
+	movups	 80(%rsp), %xmm7
+	movups	 96(%rsp), %xmm8
+	movups	112(%rsp), %xmm9
+	movups	128(%rsp), %xmm10
+	movups	144(%rsp), %xmm11
+	movups	160(%rsp), %xmm12
+	movups	176(%rsp), %xmm13
+	movups	192(%rsp), %xmm14
+	movups	208(%rsp), %xmm15
+#endif
+
+	addq	$STACKSIZE, %rsp		// drop the STACKSIZE-byte register save area
+	ret
+
+	EPILOGUE
+
+
+
+#else
+
+/*************************************************************************************
+* TRMM Kernel
+*************************************************************************************/
+
+
+	PROLOGUE
+	PROFCODE
+	
+	subq	$STACKSIZE, %rsp
+	movq	%rbx,   (%rsp)
+	movq	%rbp,  8(%rsp)
+	movq	%r12, 16(%rsp)
+	movq	%r13, 24(%rsp)
+	movq	%r14, 32(%rsp)
+	movq	%r15, 40(%rsp)
+
+	vzeroupper
+
+#ifdef WINDOWS_ABI
+	movq	%rdi,    48(%rsp)
+	movq	%rsi,    56(%rsp)
+	movups	%xmm6,   64(%rsp)
+	movups	%xmm7,   80(%rsp)
+	movups	%xmm8,   96(%rsp)
+	movups	%xmm9,  112(%rsp)
+	movups	%xmm10, 128(%rsp)
+	movups	%xmm11, 144(%rsp)
+	movups	%xmm12, 160(%rsp)
+	movups	%xmm13, 176(%rsp)
+	movups	%xmm14, 192(%rsp)
+	movups	%xmm15, 208(%rsp)
+
+	movq	ARG1,      OLD_M
+	movq	ARG2,      OLD_N
+	movq	ARG3,      OLD_K
+	movq	OLD_A,     A
+	movq	OLD_B,     B
+	movq	OLD_C,     C
+	movq	OLD_LDC,   LDC
+#ifdef TRMMKERNEL
+	vmovsd	OLD_OFFSET, %xmm12
+#endif
+	vmovaps	%xmm3, %xmm0
+
+#else
+	movq	STACKSIZE +  8(%rsp), LDC
+#ifdef TRMMKERNEL
+	movsd	STACKSIZE + 16(%rsp), %xmm12
+#endif
+
+#endif
+
+	movq    %rsp, SP      # save old stack (restored again at .L999)
+        subq    $128 + L_BUFFER_SIZE, %rsp	// reserve 128 + L_BUFFER_SIZE bytes below the save area
+        andq    $-4096, %rsp    # round rsp down to a 4096-byte boundary
+
+        STACK_TOUCH
+
+	cmpq	$0, OLD_M		// quick return when M, N or K is zero
+	je	.L999
+
+	cmpq	$0, OLD_N
+	je	.L999
+
+	cmpq	$0, OLD_K
+	je	.L999
+
+	movq	OLD_M, M
+	movq	OLD_N, N
+	movq	OLD_K, K
+
+	vmovss	 %xmm0, ALPHA		// keep the single-precision scalar from xmm0 in ALPHA
+
+	salq	$BASE_SHIFT, LDC	// LDC <<= BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT is defined elsewhere)
+
+	movq    N, %rax
+        xorq    %rdx, %rdx		// clear the high half of the dividend rdx:rax
+        movq    $4,  %rdi
+        divq    %rdi                    //    unsigned divide: rax = N / 4 , rdx = N % 4
+        movq    %rax, Ndiv6             //    N / 4  (the divisor is 4, whatever the Ndiv6 name suggests)
+        movq    %rdx, Nmod6             //    N % 4  (likewise for Nmod6)
+
+	
+
+#ifdef TRMMKERNEL
+	vmovsd	%xmm12, OFFSET
+	vmovsd	%xmm12, KK
+#ifndef LEFT
+	negq	KK
+#endif	
+#endif
+
+	movq	Ndiv6,  J
+	cmpq	$0, J
+	je	.L2_0
+	ALIGN_4
+
+/*******************************************************************************************/
+
+.L4_01:
+	// copy the current panel of B into BUFFER1: 4 values for each of the K steps
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	sarq	$2, %rax		// K / 4 : number of 4-step chunks
+	jz	.L4_01b
+	ALIGN_4
+
+
+.L4_01a:
+        prefetcht0 512(BO1)
+        prefetchw  512(BO)
+
+	vmovups	       (BO1), %xmm0	// load 16*SIZE bytes worth of values in four 128-bit pieces
+	vmovups	 4*SIZE(BO1), %xmm1
+	vmovups	 8*SIZE(BO1), %xmm2
+	vmovups	12*SIZE(BO1), %xmm3
+
+	vmovups	%xmm0,       (BO)	// and store them unchanged to the buffer
+	vmovups	%xmm1, 4*SIZE(BO)
+	vmovups	%xmm2, 8*SIZE(BO)
+	vmovups	%xmm3,12*SIZE(BO)
+
+	addq	$ 16*SIZE,BO1
+	addq	$ 16*SIZE,BO
+	decq	%rax
+	jnz	.L4_01a
+
+
+.L4_01b:
+
+        movq    K, %rax
+        andq    $3, %rax                // K % 4 : steps not covered by the chunked loop
+        jz      .L4_02d
+        ALIGN_4
+
+.L4_02c:
+
+	vmovups	(BO1), %xmm0		// one step at a time: 4 values
+	vmovups	%xmm0, (BO)
+	addq	$ 4*SIZE,BO1
+	addq	$ 4*SIZE,BO
+	decq	%rax
+	jnz	.L4_02c
+
+.L4_02d:
+
+	movq	BO1, B			// next offset of B (start of the following panel)
+
+.L4_10:
+	movq	 C, CO1
+	leaq	(C, LDC, 2), CO2	
+	leaq	(C, LDC, 4), C		// c += 4 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$ 16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L4_20
+
+	ALIGN_4
+
+.L4_11:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             	// first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $16, %rax	// number of values in AO
+#else
+        addq    $4, %rax	// number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L4_16
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4) , BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_12:
+
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	prefetcht0	B_PR1(BO, BI  , SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	prefetcht0	B_PR1(BO, BI  , SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+
+	je	.L4_16
+
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	prefetcht0	B_PR1(BO, BI  , SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	prefetcht0	B_PR1(BO, BI  , SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	KERNEL16x4_SUB
+
+	je	.L4_16
+
+	jmp	.L4_12
+	ALIGN_4
+
+.L4_16:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = count & 7 : iterations left over after the unrolled-by-8 loop
+	je .L4_19			// no remainder: go to .L4_19 (SAVE16x4)
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L4_17:
+
+	KERNEL16x4_SUB
+
+	jl	.L4_17				// loop while index is negative (flags presumably set inside the KERNEL macro, not shown here)
+	ALIGN_4
+
+
+.L4_19:
+
+	SAVE16x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK				
+#endif
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	addq	$16 * SIZE, CO2		# coffset += 16
+	decq	I			# i --
+	jg	.L4_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L4_20:
+	// Test rest of M
+
+	testq	$15, M		// any rows left after the 16-row tiles?
+	jz	.L4_60		// none: on to the next 4 lines of N
+
+	testq	$8, M		// is bit 3 of M set, i.e. an 8-row tile is needed?
+	jz	.L4_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L4_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L4_20_6
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_20_2:
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	je	.L4_20_6
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	je	.L4_20_6
+
+	jmp	.L4_20_2
+	ALIGN_4
+
+.L4_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = count & 7 : iterations left over after the unrolled-by-8 loop
+	je .L4_20_9			// no remainder: go to .L4_20_9 (SAVE8x4)
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L4_20_7:
+
+	KERNEL8x4_SUB
+
+	jl	.L4_20_7			// loop while index is negative (flags presumably set inside the KERNEL macro, not shown here)
+	ALIGN_4
+
+
+.L4_20_9:
+
+	SAVE8x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	addq	$8 * SIZE, CO2		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L4_21pre:
+
+	testq	$4, M		
+	jz	.L4_30
+	ALIGN_4
+
+.L4_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L4_26
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_22:
+
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+
+	je	.L4_26
+
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+	KERNEL4x4_SUB
+
+	je	.L4_26
+
+	jmp	.L4_22
+	ALIGN_4
+
+.L4_26:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = count & 7 : iterations left over after the unrolled-by-8 loop
+	je .L4_29			// no remainder: go to .L4_29 (SAVE4x4)
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L4_27:
+
+	KERNEL4x4_SUB
+
+	jl	.L4_27				// loop while index is negative (flags presumably set inside the KERNEL macro, not shown here)
+	ALIGN_4
+
+
+.L4_29:
+
+	SAVE4x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK
+#endif
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	addq	$4 * SIZE, CO2		# coffset += 4
+	ALIGN_4
+	
+
+.L4_30:
+	testq	$2, M		
+	jz	.L4_40
+
+	ALIGN_4
+
+.L4_31:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // number of values in AO
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L4_36
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_32:
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	je	.L4_36
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	je	.L4_36
+
+	jmp	.L4_32
+	ALIGN_4
+
+.L4_36:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = count & 7 : iterations left over after the unrolled-by-8 loop
+	je .L4_39			// no remainder: go to .L4_39 (SAVE2x4)
+
+	movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+	
+	salq	$1, %rax			// rax = rax * 2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L4_37:
+
+	KERNEL2x4_SUB
+
+	jl	.L4_37				// loop while index is negative (flags presumably set inside the KERNEL macro, not shown here)
+	ALIGN_4
+
+
+.L4_39:
+
+	SAVE2x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK
+#endif
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	addq	$2 * SIZE, CO2		# coffset += 2
+	ALIGN_4
+
+.L4_40:
+	testq	$1, M		
+	jz	.L4_60		// to next 4 lines of N
+
+	ALIGN_4
+
+.L4_41:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // number of values in AO
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax
+	je	.L4_46
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_42:
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	je	.L4_46
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	je	.L4_46
+
+	jmp	.L4_42
+	ALIGN_4
+
+.L4_46:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = count & 7 : iterations left over after the unrolled-by-8 loop
+	je .L4_49			// no remainder: go to .L4_49 (SAVE1x4)
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO		// one A value per iteration, so rax needs no scaling
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L4_47:
+
+	KERNEL1x4_SUB
+
+	jl	.L4_47				// loop while index is negative (flags presumably set inside the KERNEL macro, not shown here)
+	ALIGN_4
+
+
+.L4_49:
+
+	SAVE1x4
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	addq	$1 * SIZE, CO2		# coffset += 1
+	ALIGN_4
+	
+
+
+
+	
+.L4_60:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $4, KK
+#endif
+
+	decq	J			// j --
+	jg	.L4_01			// next 4 lines of N
+
+
+
+/*******************************************************************************************/
+.L2_0:
+
+	movq	Nmod6, J		// Nmod6 holds the remainder N % 4
+	andq	$3, J			// any columns left at all?
+	je	.L999			// none: leave the kernel
+
+	movq	Nmod6, J		
+	andq	$2, J			// test bit 1 of the remainder: is a 2-column panel needed?
+	je	.L1_0			// no: continue at .L1_0 (defined outside this view; presumably the 1-column path)
+
+.L2_01:
+
+	// copy the current panel of B into BUFFER1: 2 values for each of the K steps
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	sarq	$2, %rax		// K / 4 : number of 4-step chunks
+	jz	.L2_01b
+	ALIGN_4
+
+.L2_01a:
+
+	vmovsd	      (BO1), %xmm0	// each vmovsd moves 8 bytes; offsets advance by 2*SIZE per step
+	vmovsd	2*SIZE(BO1), %xmm1
+	vmovsd	4*SIZE(BO1), %xmm2
+	vmovsd	6*SIZE(BO1), %xmm3
+
+	vmovsd	%xmm0,       (BO)	// store the four pieces unchanged to the buffer
+	vmovsd	%xmm1, 2*SIZE(BO)
+	vmovsd	%xmm2, 4*SIZE(BO)
+	vmovsd	%xmm3, 6*SIZE(BO)
+
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO
+	decq	%rax
+	jnz	.L2_01a
+
+
+.L2_01b:
+
+        movq    K, %rax
+        andq    $3, %rax                // K % 4 : steps not covered by the chunked loop
+        jz      .L2_02d
+        ALIGN_4
+
+.L2_02c:
+
+	vmovsd 	(BO1), %xmm0		// one step at a time
+	vmovsd 	%xmm0, (BO)
+	addq	$2*SIZE,BO1
+	addq	$2*SIZE,BO
+	decq	%rax
+	jnz	.L2_02c
+
+.L2_02d:
+
+	movq	BO1, B			// next offset of B (start of the following panel)
+
+.L2_10:
+	movq	C, CO1
+	leaq	(C, LDC, 2), C		// c += 2 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L2_20
+
+	ALIGN_4
+
+.L2_11:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $16, %rax	// number of values in AO
+#else
+        addq    $2, %rax	// number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L2_16
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_12:
+
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+
+	je	.L2_16
+
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+	KERNEL16x2_SUB
+
+	je	.L2_16
+
+	jmp	.L2_12
+	ALIGN_4
+
+.L2_16:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = count & 7 : iterations left over after the unrolled-by-8 loop
+	je .L2_19			// no remainder: go to .L2_19 (SAVE16x2)
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L2_17:
+
+	KERNEL16x2_SUB
+
+	jl	.L2_17				// loop while index is negative (flags presumably set inside the KERNEL macro, not shown here)
+	ALIGN_4
+
+
+.L2_19:
+
+	SAVE16x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK				
+#endif
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L2_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L2_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L2_60		// to next 2 lines of N
+
+	testq	$8, M		
+	jz	.L2_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L2_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_20_6
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_2:
+
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	je	.L2_20_6
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	je	.L2_20_6
+
+	jmp	.L2_20_2
+	ALIGN_4
+
+.L2_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = count & 7 : iterations left over after the unrolled-by-8 loop
+	je .L2_20_9			// no remainder: go to .L2_20_9 (SAVE8x2)
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L2_20_7:
+
+	KERNEL8x2_SUB
+
+	jl	.L2_20_7			// loop while index is negative (flags presumably set inside the KERNEL macro, not shown here)
+	ALIGN_4
+
+
+.L2_20_9:
+
+	SAVE8x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+	testq	$4, M		
+	jz	.L2_30
+	ALIGN_4
+
+.L2_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax			// round the count down to a multiple of 8
+	je	.L2_26				// fewer than 8 iterations: only the remainder loop runs
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L2_22:
+
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je	.L2_26
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je	.L2_26
+
+	jmp	.L2_22
+	ALIGN_4
+
+.L2_26:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = count & 7 : iterations left over after the unrolled-by-8 loop
+	je .L2_29			// no remainder: go to .L2_29 (SAVE4x2)
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// both indices now run from -count up to 0
+	negq	%rax
+	ALIGN_4
+
+.L2_27:
+
+	KERNEL4x2_SUB
+
+	jl	.L2_27				// loop while index is negative (flags presumably set inside the KERNEL macro, not shown here)
+	ALIGN_4
+
+
+.L2_29:
+
+	SAVE4x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK
+#endif
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L2_30:
+	testq	$2, M		
+	jz	.L2_40
+
+	ALIGN_4
+
+.L2_31:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // number of values in AO
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_36
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_32:
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	je	.L2_36
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	je	.L2_36
+
+	jmp	.L2_32
+	ALIGN_4
+
+.L2_36:				// K-loop remainder for the KERNEL2x2_SUB tile
+#ifndef TRMMKERNEL
+        movq    K, %rax			// GEMM: loop count is K
+#else
+        movq    KKK, %rax		// TRMM: loop count is the value stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax = count & 7: iterations left after the multiple-of-8 main loop
+	je .L2_39			// remainder is zero: skip the single-step loop
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+	
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_37:
+
+	KERNEL2x2_SUB
+
+	jl	.L2_37
+	ALIGN_4
+
+
+.L2_39:
+
+	SAVE2x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK
+#endif
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L2_40:
+	testq	$1, M		
+	jz	.L2_60		// to next 2 lines of N
+
+	ALIGN_4
+
+.L2_41:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // number of values in AO
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax
+	je	.L2_46
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_42:
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	je	.L2_46
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	je	.L2_46
+
+	jmp	.L2_42
+	ALIGN_4
+
+.L2_46:				// K-loop remainder for the KERNEL1x2_SUB tile
+#ifndef TRMMKERNEL
+        movq    K, %rax			// GEMM: loop count is K
+#else
+        movq    KKK, %rax		// TRMM: loop count is the value stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax = count & 7: iterations left after the multiple-of-8 main loop
+	je .L2_49			// remainder is zero: skip the single-step loop
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_47:
+
+	KERNEL1x2_SUB
+
+	jl	.L2_47
+	ALIGN_4
+
+
+.L2_49:
+
+	SAVE1x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+
+
+	
+.L2_60:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $2, KK
+#endif
+
+
+
+
+.L1_0:
+
+/************************************************************************************************
+* Loop for Nmod6 % 2 > 0
+*************************************************************************************************/
+
+	movq	Nmod6, J		
+	andq	$1, J			// j % 2
+	je	.L999
+	ALIGN_4
+
+.L1_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	ALIGN_4
+
+.L1_02b:
+
+	vmovss	(BO1), %xmm0
+	vmovss	%xmm0,       (BO)
+	addq	$1*SIZE,BO1
+	addq	$1*SIZE,BO
+	decq	%rax
+	jnz	.L1_02b
+
+.L1_02c:
+
+	movq	BO1, B			// next offset of B
+
+.L1_10:
+	movq	C, CO1
+	leaq	(C, LDC, 1), C		// c += 1 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L1_20
+
+	ALIGN_4
+
+.L1_11:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $16, %rax	// number of values in AO
+#else
+        addq    $1, %rax	// number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L1_16
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_12:
+
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+
+	je	.L1_16
+
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+	KERNEL16x1_SUB
+
+	je	.L1_16
+
+	jmp	.L1_12
+	ALIGN_4
+
+.L1_16:				// K-loop remainder for the KERNEL16x1_SUB tile
+#ifndef TRMMKERNEL
+        movq    K, %rax			// GEMM: loop count is K
+#else
+        movq    KKK, %rax		// TRMM: loop count is the value stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax = count & 7: iterations left after the multiple-of-8 main loop
+	je .L1_19			// remainder is zero: skip the single-step loop
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_17:
+
+	KERNEL16x1_SUB
+
+	jl	.L1_17
+	ALIGN_4
+
+
+.L1_19:
+
+	SAVE16x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK				
+#endif
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L1_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L1_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L999
+
+	testq	$8, M		
+	jz	.L1_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L1_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_20_6
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_20_2:
+
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+
+	je	.L1_20_6
+
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+	KERNEL8x1_SUB
+
+	je	.L1_20_6
+
+	jmp	.L1_20_2
+	ALIGN_4
+
+.L1_20_6:				// K-loop remainder for the KERNEL8x1_SUB tile
+#ifndef TRMMKERNEL
+        movq    K, %rax			// GEMM: loop count is K
+#else
+        movq    KKK, %rax		// TRMM: loop count is the value stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax = count & 7: iterations left after the multiple-of-8 main loop
+	je .L1_20_9			// remainder is zero: skip the single-step loop
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_20_7:
+
+	KERNEL8x1_SUB
+
+	jl	.L1_20_7
+	ALIGN_4
+
+
+.L1_20_9:
+
+	SAVE8x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L1_21pre:
+
+	testq	$4, M		
+	jz	.L1_30
+	ALIGN_4
+
+.L1_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_26
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_22:
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	je	.L1_26
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	je	.L1_26
+
+	jmp	.L1_22
+	ALIGN_4
+
+.L1_26:				// K-loop remainder for the KERNEL4x1_SUB tile
+#ifndef TRMMKERNEL
+        movq    K, %rax			// GEMM: loop count is K
+#else
+        movq    KKK, %rax		// TRMM: loop count is the value stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax = count & 7: iterations left after the multiple-of-8 main loop
+	je .L1_29			// remainder is zero: skip the single-step loop
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_27:
+
+	KERNEL4x1_SUB
+
+	jl	.L1_27
+	ALIGN_4
+
+
+.L1_29:
+
+	SAVE4x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK
+#endif
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L1_30:
+	testq	$2, M		
+	jz	.L1_40
+
+	ALIGN_4
+
+.L1_31:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // number of values in AO
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_36
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_32:
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	je	.L1_36
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	je	.L1_36
+
+	jmp	.L1_32
+	ALIGN_4
+
+.L1_36:				// K-loop remainder for the KERNEL2x1_SUB tile
+#ifndef TRMMKERNEL
+        movq    K, %rax			// GEMM: loop count is K
+#else
+        movq    KKK, %rax		// TRMM: loop count is the value stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax = count & 7: iterations left after the multiple-of-8 main loop
+	je .L1_39			// remainder is zero: skip the single-step loop
+
+	movq    %rax, BI                        //  Index for BO
+	
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_37:
+
+	KERNEL2x1_SUB
+
+	jl	.L1_37
+	ALIGN_4
+
+
+.L1_39:
+
+	SAVE2x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK
+#endif
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L1_40:
+	testq	$1, M		
+	jz	.L999
+
+	ALIGN_4
+
+.L1_41:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // number of values in AO
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax
+	je	.L1_46
+	movq    %rax, BI                        //  Index for BO
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_42:
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	je	.L1_46
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	je	.L1_46
+
+	jmp	.L1_42
+	ALIGN_4
+
+.L1_46:				// K-loop remainder for the KERNEL1x1_SUB tile
+#ifndef TRMMKERNEL
+        movq    K, %rax			// GEMM: loop count is K
+#else
+        movq    KKK, %rax		// TRMM: loop count is the value stored in KKK above
+#endif
+
+	andq	$7, %rax		# rax = count & 7: iterations left after the multiple-of-8 main loop
+	je .L1_49			// remainder is zero: skip the single-step loop
+
+	movq    %rax, BI                        //  Index for BO
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_47:
+
+	KERNEL1x1_SUB
+
+	jl	.L1_47
+	ALIGN_4
+
+
+.L1_49:
+
+	SAVE1x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+.L999:
+	movq   		SP, %rsp
+	movq	   (%rsp), %rbx
+	movq	  8(%rsp), %rbp
+	movq	 16(%rsp), %r12
+	movq	 24(%rsp), %r13
+	movq	 32(%rsp), %r14
+	movq	 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	movq	 48(%rsp), %rdi
+	movq	 56(%rsp), %rsi
+	movups	 64(%rsp), %xmm6
+	movups	 80(%rsp), %xmm7
+	movups	 96(%rsp), %xmm8
+	movups	112(%rsp), %xmm9
+	movups	128(%rsp), %xmm10
+	movups	144(%rsp), %xmm11
+	movups	160(%rsp), %xmm12
+	movups	176(%rsp), %xmm13
+	movups	192(%rsp), %xmm14
+	movups	208(%rsp), %xmm15
+#endif
+
+	addq	$STACKSIZE, %rsp
+	ret
+
+	EPILOGUE
+
+
+#endif
+
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index fd028964b..65305ac59 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "sgemv_n_microk_nehalem-4.c"
 #elif defined(SANDYBRIDGE)
 #include "sgemv_n_microk_sandy-4.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "sgemv_n_microk_haswell-4.c"
 #endif
 
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
index f04d461f7..065e5b385 100644
--- a/kernel/x86_64/sgemv_t_4.c
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "sgemv_t_microk_bulldozer-4.c"
 #elif defined(SANDYBRIDGE)
 #include "sgemv_t_microk_sandy-4.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "sgemv_t_microk_haswell-4.c"
 #endif
 
diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c
index 199d8a517..73ae001ea 100644
--- a/kernel/x86_64/ssymv_L.c
+++ b/kernel/x86_64/ssymv_L.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ssymv_L_microk_bulldozer-2.c"
 #elif defined(NEHALEM)
 #include "ssymv_L_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "ssymv_L_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "ssymv_L_microk_sandy-2.c"
diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c
index 691a071f7..f37c251a1 100644
--- a/kernel/x86_64/ssymv_U.c
+++ b/kernel/x86_64/ssymv_U.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ssymv_U_microk_bulldozer-2.c"
 #elif defined(NEHALEM)
 #include "ssymv_U_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "ssymv_U_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "ssymv_U_microk_sandy-2.c"
diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S
index 8cae3fc1b..8a5c44c9b 100644
--- a/kernel/x86_64/symv_L_sse.S
+++ b/kernel/x86_64/symv_L_sse.S
@@ -57,7 +57,7 @@
 #define PREFETCHSIZE	(16 * 12)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht0
 #define PREFETCHW	prefetcht0
 #define PREFETCHSIZE	(16 * 12)
diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S
index d7091624d..0c40a3435 100644
--- a/kernel/x86_64/symv_L_sse2.S
+++ b/kernel/x86_64/symv_L_sse2.S
@@ -57,7 +57,7 @@
 #define PREFETCHSIZE	(16 * 12)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht0
 #define PREFETCHW	prefetcht0
 #define PREFETCHSIZE	(16 * 12)
diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S
index 3549b9863..7a2eeace5 100644
--- a/kernel/x86_64/symv_U_sse.S
+++ b/kernel/x86_64/symv_U_sse.S
@@ -57,7 +57,7 @@
 #define PREFETCHSIZE	(16 * 12)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE)  || defined(HASWELL) || defined(ZEN)
+#if defined(NEHALEM) || defined(SANDYBRIDGE)  || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht0
 #define PREFETCHW	prefetcht0
 #define PREFETCHSIZE	(16 * 12)
diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S
index 882b035a9..0408b577c 100644
--- a/kernel/x86_64/symv_U_sse2.S
+++ b/kernel/x86_64/symv_U_sse2.S
@@ -57,7 +57,7 @@
 #define PREFETCHSIZE	(16 * 12)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE)  || defined(HASWELL) || defined(ZEN)
+#if defined(NEHALEM) || defined(SANDYBRIDGE)  || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht0
 #define PREFETCHW	prefetcht0
 #define PREFETCHSIZE	(16 * 24)
diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c
index 8cb1d532f..53866cf95 100644
--- a/kernel/x86_64/zaxpy.c
+++ b/kernel/x86_64/zaxpy.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "zaxpy_microk_bulldozer-2.c"
 #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
 #include "zaxpy_microk_steamroller-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "zaxpy_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "zaxpy_microk_sandy-2.c"
diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index d11c76647..ef12569c8 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "zdot_microk_bulldozer-2.c"
 #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
 #include "zdot_microk_steamroller-2.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "zdot_microk_haswell-2.c"
 #elif defined(SANDYBRIDGE)
 #include "zdot_microk_sandy-2.c"
diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c
index f6f88155c..0fedc496b 100644
--- a/kernel/x86_64/zgemv_n_4.c
+++ b/kernel/x86_64/zgemv_n_4.c
@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 
-#if defined(HASWELL) || defined(ZEN)
+#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "zgemv_n_microk_haswell-4.c"
 #elif defined(SANDYBRIDGE)
 #include "zgemv_n_microk_sandy-4.c"
diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c
index 3e4b7d5df..2ab7a671b 100644
--- a/kernel/x86_64/zgemv_t_4.c
+++ b/kernel/x86_64/zgemv_t_4.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)  || defined(EXCAVATOR)
 #include "zgemv_t_microk_bulldozer-4.c"
-#elif defined(HASWELL) || defined(ZEN)
+#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "zgemv_t_microk_haswell-4.c"
 #endif
 
diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c
index aa5d8fac0..2a6d0e4c7 100644
--- a/kernel/x86_64/zscal.c
+++ b/kernel/x86_64/zscal.c
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 
-#if defined(HASWELL) || defined(ZEN)
+#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "zscal_microk_haswell-2.c"
 #elif defined(BULLDOZER)  || defined(PILEDRIVER)
 #include "zscal_microk_bulldozer-2.c"
diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S
index dd95eea17..e44bd7550 100644
--- a/kernel/x86_64/zsymv_L_sse.S
+++ b/kernel/x86_64/zsymv_L_sse.S
@@ -57,7 +57,7 @@
 #define PREFETCHSIZE	(16 * 24)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht0
 #define PREFETCHW	prefetcht0
 #define PREFETCHSIZE	(16 * 24)
diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S
index 75124cf3e..e9f330c36 100644
--- a/kernel/x86_64/zsymv_L_sse2.S
+++ b/kernel/x86_64/zsymv_L_sse2.S
@@ -57,7 +57,7 @@
 #define PREFETCHSIZE	(16 * 24)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht0
 #define PREFETCHW	prefetcht0
 #define PREFETCHSIZE	(16 * 24)
diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S
index db1a4ff5f..9f0dead18 100644
--- a/kernel/x86_64/zsymv_U_sse.S
+++ b/kernel/x86_64/zsymv_U_sse.S
@@ -57,7 +57,7 @@
 #define PREFETCHSIZE	(16 * 24)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE)  || defined(HASWELL) || defined(ZEN)
+#if defined(NEHALEM) || defined(SANDYBRIDGE)  || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht0
 #define PREFETCHW	prefetcht0
 #define PREFETCHSIZE	(16 * 24)
diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S
index 599765a6d..b6106a37d 100644
--- a/kernel/x86_64/zsymv_U_sse2.S
+++ b/kernel/x86_64/zsymv_U_sse2.S
@@ -57,7 +57,7 @@
 #define PREFETCHSIZE	(16 * 24)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE)  || defined(HASWELL) || defined(ZEN)
+#if defined(NEHALEM) || defined(SANDYBRIDGE)  || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #define PREFETCH	prefetcht0
 #define PREFETCHW	prefetcht0
 #define PREFETCHSIZE	(16 * 24)
diff --git a/param.h b/param.h
index 4227d548e..49a5e85e8 100644
--- a/param.h
+++ b/param.h
@@ -1613,6 +1613,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
+#endif
+
+#ifdef SKYLAKEX
+
+#define SNUMOPT         16
+#define DNUMOPT         8
+
+#define GEMM_DEFAULT_OFFSET_A     0
+#define GEMM_DEFAULT_OFFSET_B     0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SYMV_P  8
+
+#define SWITCH_RATIO	4
+
+#ifdef ARCH_X86
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#else
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 8
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#define SGEMM_DEFAULT_UNROLL_MN 32
+#define DGEMM_DEFAULT_UNROLL_MN 32
+#endif
+
+#ifdef ARCH_X86
+
+#define SGEMM_DEFAULT_P 512
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_P 512
+#define DGEMM_DEFAULT_R dgemm_r
+#define QGEMM_DEFAULT_P 504
+#define QGEMM_DEFAULT_R qgemm_r
+#define CGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_R 1024
+#define ZGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_R zgemm_r
+#define XGEMM_DEFAULT_P 252
+#define XGEMM_DEFAULT_R xgemm_r
+#define SGEMM_DEFAULT_Q 256
+#define DGEMM_DEFAULT_Q 256
+#define QGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 256
+#define ZGEMM_DEFAULT_Q 192
+#define XGEMM_DEFAULT_Q 128
+
+#else
+
+#define SGEMM_DEFAULT_P 768
+#define DGEMM_DEFAULT_P 512
+#define CGEMM_DEFAULT_P 384
+#define ZGEMM_DEFAULT_P 256
+
+#ifdef WINDOWS_ABI
+#define SGEMM_DEFAULT_Q 320
+#define DGEMM_DEFAULT_Q 128
+#else
+#define SGEMM_DEFAULT_Q 384
+#define DGEMM_DEFAULT_Q 256
+#endif
+#define CGEMM_DEFAULT_Q 192
+#define ZGEMM_DEFAULT_Q 128
+
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_R 13824
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
+
+#define QGEMM_DEFAULT_Q 128
+#define QGEMM_DEFAULT_P 504
+#define QGEMM_DEFAULT_R qgemm_r
+#define XGEMM_DEFAULT_P 252
+#define XGEMM_DEFAULT_R xgemm_r
+#define XGEMM_DEFAULT_Q 128
+
+#define CGEMM3M_DEFAULT_UNROLL_N 8
+#define CGEMM3M_DEFAULT_UNROLL_M 4
+#define ZGEMM3M_DEFAULT_UNROLL_N 8
+#define ZGEMM3M_DEFAULT_UNROLL_M 2
+
+#define CGEMM3M_DEFAULT_P 448
+#define ZGEMM3M_DEFAULT_P 224
+#define XGEMM3M_DEFAULT_P 112
+#define CGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 224
+#define XGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_R 12288
+#define ZGEMM3M_DEFAULT_R 12288
+#define XGEMM3M_DEFAULT_R 12288
+
+#endif
+
+
 #endif
 
 

From 00235157339dc5fba2b4194bd660c45257e539e1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Jun 2018 13:22:59 +0200
Subject: [PATCH 16/86] Typo fix (misplaced parenthesis)

---
 kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S
index 361ccf603..e2f731fca 100644
--- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S
@@ -62,7 +62,7 @@
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif
 
-#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL || defined (SKYLAKEX))
+#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
 #define PREFETCH     prefetcht0
 #define PREFETCHSIZE  (8 * 21 + 4)
 #endif

From f1fb9a474571846ffc140313dbe5b8ba21925b74 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Jun 2018 13:48:27 +0200
Subject: [PATCH 17/86] Propagate NO_AVX512 if needed

---
 Makefile.system | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index b005b80c9..cec4b44e5 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -147,6 +147,10 @@ ifeq ($(NO_AVX2), 1)
 GETARCH_FLAGS	+= -DNO_AVX2
 endif
 
+ifeq ($(NO_AVX512), 1)
+GETARCH_FLAGS	+= -DNO_AVX512
+endif
+
 ifeq ($(DEBUG), 1)
 GETARCH_FLAGS	+= -g
 endif

From a7d0f49cec68dc3f116feed0320708ae004af4c4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Jun 2018 23:13:25 +0200
Subject: [PATCH 18/86] Add SKYLAKEX to DYNAMIC_CORE list only if AVX512 is
 available

---
 Makefile.system | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index cec4b44e5..82e38a6d2 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -477,7 +477,12 @@ ifneq ($(NO_AVX), 1)
 DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
 endif
 ifneq ($(NO_AVX2), 1)
-DYNAMIC_CORE += HASWELL ZEN SKYLAKEX
+DYNAMIC_CORE += HASWELL ZEN
+endif
+ifneq ($(NO_AVX512), 1)
+ifneq ($(NO_AVX2), 1)
+DYNAMIC_CORE += SKYLAKEX
+endif
 endif
 endif
 

From 5a92b311e05fb938e1fd85dcaf6fbeebc77bd4fb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Jun 2018 23:29:07 +0200
Subject: [PATCH 19/86] Separate Skylake X from Skylake

---
 cpuid_x86.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 5f49e7715..d0dbe1d24 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1301,6 +1301,19 @@ int get_cpuname(void){
           else
 	    return CPUTYPE_NEHALEM;
 	case 5:
+	  // Skylake X
+#ifndef NO_AVX512
+	  return CPUTYPE_SKYLAKEX;
+#else
+	  if(support_avx())
+#ifndef NO_AVX2
+	  return CPUTYPE_HASWELL;
+#else
+	  return CPUTYPE_SANDYBRIDGE;
+#endif
+	  else
+	  return CPUTYPE_NEHALEM;
+#endif			
         case 14:
 	  // Skylake
           if(support_avx())
@@ -1558,6 +1571,7 @@ static char *cpuname[] = {
   "STEAMROLLER",
   "EXCAVATOR",
   "ZEN",
+  "SKYLAKEX"	
 };
 
 static char *lowercpuname[] = {
@@ -1612,6 +1626,7 @@ static char *lowercpuname[] = {
   "steamroller",
   "excavator",
   "zen",
+  "skylakex"
 };
 
 static char *corename[] = {
@@ -1643,6 +1658,7 @@ static char *corename[] = {
   "STEAMROLLER",
   "EXCAVATOR",
   "ZEN",
+  "SKYLAKEX"	
 };
 
 static char *corename_lower[] = {
@@ -1674,6 +1690,7 @@ static char *corename_lower[] = {
   "steamroller",
   "excavator",
   "zen",
+  "skylakex"	
 };
 
 
@@ -1862,6 +1879,19 @@ int get_coretype(void){
           else
 	    return CORE_NEHALEM;
 	case 5:
+	 // Skylake X
+#ifndef NO_AVX512
+	    return CORE_SKYLAKEX;
+#else /* NO_AVX512: pick a fallback core from AVX / AVX2 availability */
+	  if(support_avx())
+#ifndef NO_AVX2
+	    return CORE_HASWELL;
+#else
+	    return CORE_SANDYBRIDGE;
+#endif
+	  else
+	    return CORE_NEHALEM;
+#endif /* NO_AVX512 */
 	case 14:
 	  // Skylake
           if(support_avx())

From 5a51cf4576df2e065e5517b04369ff10a2a83f58 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Jun 2018 23:41:33 +0200
Subject: [PATCH 20/86] Separate Skylake X from Skylake

---
 driver/others/dynamic.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index a0c9794b1..5e9a24b8b 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -79,6 +79,11 @@ extern gotoblas_t  gotoblas_EXCAVATOR;
 #else
 extern gotoblas_t  gotoblas_HASWELL;
 extern gotoblas_t  gotoblas_ZEN;
+#ifndef NO_AVX512
+extern gotoblas_t  gotoblas_SKYLAKEX;
+#else /* NO_AVX512: alias SKYLAKEX to HASWELL; no trailing ';' so &gotoblas_SKYLAKEX expands cleanly */
+#define gotoblas_SKYLAKEX gotoblas_HASWELL
+#endif
 #endif
 #else
 //Use NEHALEM kernels for sandy bridge
@@ -286,8 +291,21 @@ static gotoblas_t *get_coretype(void){
 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 	  }
 	}
+	if (model == 5) {	
+	// Intel Skylake X
+#ifndef NO_AVX512
+	  return $gotoblas_SKYLAKEX;
+#else		
+	  if(support_avx())
+	    return &gotoblas_HASWELL;
+	  else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM;
+	  }
+#endif /* NO_AVX512 */
+	} /* model == 5; closed outside the #if so both branches balance */
 	//Intel Skylake
-	if (model == 14 || model == 5) {
+	if (model == 14) {
 	  if(support_avx())
 	    return &gotoblas_HASWELL;
 	  else{
@@ -447,7 +465,8 @@ static char *corename[] = {
     "Haswell",
     "Steamroller",
     "Excavator",
-    "Zen"
+    "Zen",
+    "SkylakeX"	
 };
 
 char *gotoblas_corename(void) {
@@ -475,7 +494,7 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_STEAMROLLER)  return corename[21];
   if (gotoblas == &gotoblas_EXCAVATOR)    return corename[22];
   if (gotoblas == &gotoblas_ZEN)          return corename[23];
-
+  if (gotoblas == &gotoblas_SKYLAKEX)     return corename[24];
   return corename[0];
 }
 
@@ -505,6 +524,7 @@ static gotoblas_t *force_coretype(char *coretype){
 
 	switch (found)
 	{
+		case 24: return (&gotoblas_SKYLAKEX);	
 		case 23: return (&gotoblas_ZEN);
 		case 22: return (&gotoblas_EXCAVATOR);
 		case 21: return (&gotoblas_STEAMROLLER);

From 83fec56a3f55fa24b2e541549852bdee03d30a0c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 4 Jun 2018 00:01:11 +0200
Subject: [PATCH 21/86] Disable AVX512 (Skylake X) support if the build system
 is too old

---
 c_check | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/c_check b/c_check
index a3b337602..dfe99350a 100644
--- a/c_check
+++ b/c_check
@@ -201,6 +201,21 @@ $architecture = zarch  if ($data =~ /ARCH_ZARCH/);
 $binformat    = bin32;
 $binformat    = bin64  if ($data =~ /BINARY_64/);
 
+$no_avx512= 0;
+if (($architecture eq "x86") || ($architecture eq "x86_64")) {
+    $code = '"vaddps %zmm1, %zmm0, %zmm0"'; 
+    print $tmpf "void main(void){ __asm__ volatile($code); }\n";
+    $args = " -o $tmpf.o -x c $tmpf";
+    my @cmd = ("$compiler_name $args");
+    system(@cmd) == 0;
+    if ($? != 0) {
+	$no_avx512 = 1;
+    } else {
+	$no_avx512 = 0;
+    }
+    unlink("$tmpf.o");  # remove the object file written by "-o $tmpf.o" above
+}
+
 $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
 
 $data =~ /globl\s([_\.]*)(.*)/;
@@ -288,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
 print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
 print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
 print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
+print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
 
 $os           =~ tr/[a-z]/[A-Z]/;
 $architecture =~ tr/[a-z]/[A-Z]/;

From ef626c6824c26415bc074d11325245e72f9e3284 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 4 Jun 2018 00:13:19 +0200
Subject: [PATCH 22/86] typo fix

---
 driver/others/dynamic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 5e9a24b8b..2c902d108 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -294,7 +294,7 @@ static gotoblas_t *get_coretype(void){
 	if (model == 5) {	
 	// Intel Skylake X
 #ifndef NO_AVX512
-	  return $gotoblas_SKYLAKEX;
+	  return &gotoblas_SKYLAKEX;
 #else		
 	  if(support_avx())
 	    return &gotoblas_HASWELL;

From 89372e0993b7d9fe9061797625713519392fa42b Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 3 Jun 2018 22:15:09 +0000
Subject: [PATCH 23/86] Use AVX512 also for DGEMM

this required switching to the generic gemm_beta code (which is faster anyway on SKX)
for both DGEMM and SGEMM

Performance for the not-retuned version is in the 30% range
---
 kernel/x86_64/KERNEL.SKYLAKEX              |   15 +
 kernel/x86_64/dgemm_kernel_16x2_skylakex.S | 5138 ++++++++++++++++++++
 kernel/x86_64/sgemm_kernel_16x4_skylakex.S |    3 +-
 3 files changed, 5154 insertions(+), 2 deletions(-)
 create mode 100644 kernel/x86_64/dgemm_kernel_16x2_skylakex.S

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 744831d67..c273ff8cd 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -2,3 +2,18 @@ include $(KERNELDIR)/KERNEL.HASWELL
 
 SGEMMKERNEL    =  sgemm_kernel_16x4_skylakex.S
 
+
+DTRMMKERNEL    =  ../generic/trmmkernel_16x2.c
+DGEMMKERNEL    =  dgemm_kernel_16x2_skylakex.S
+DGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
+DGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+
+SGEMM_BETA = ../generic/gemm_beta.c
+DGEMM_BETA = ../generic/gemm_beta.c
\ No newline at end of file
diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.S b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S
new file mode 100644
index 000000000..91ac51280
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S
@@ -0,0 +1,5138 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+/*********************************************************************
+* 2013/10/20 Saar
+*        BLASTEST               : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+
+*
+*
+* 2013/10/20 Saar
+* Parameter:
+*       DGEMM_DEFAULT_UNROLL_N  2
+*       DGEMM_DEFAULT_UNROLL_M  16
+*       DGEMM_DEFAULT_P         192
+*       DGEMM_DEFAULT_Q         128
+*	A_PR1			512
+*
+*
+* Performance without prefetch of B:
+*       1 thread:       45.8 GFLOPS (MKL:  45)
+*       2 threads:      80.0 GFLOPS (MKL:  91)
+*       4 threads:     135.0 GFLOPS (MKL: 135)
+*********************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+ 
+#define OLD_M	%rdi
+#define OLD_N	%rsi
+#define M	%r13
+#define J	%r14
+#define OLD_K	%rdx
+
+#define A	%rcx
+#define B	%r8
+#define C	%r9
+#define LDC	%r10
+	
+#define I	%r11
+#define AO	%rdi
+#define BO	%rsi
+#define	CO1	%r15
+#define K	%r12
+#define BI	%rbp
+#define	SP	%rbx
+
+#define BO1	%rdi
+#define BO2	%r15
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+
+#else
+
+#define STACKSIZE 256
+
+#define OLD_A		40 + STACKSIZE(%rsp)
+#define OLD_B		48 + STACKSIZE(%rsp)
+#define OLD_C		56 + STACKSIZE(%rsp)
+#define OLD_LDC		64 + STACKSIZE(%rsp)
+#define OLD_OFFSET	72 + STACKSIZE(%rsp)
+
+#endif
+
+#define L_BUFFER_SIZE 512*8*4
+#define LB2_OFFSET    512*8*2
+
+#define Ndiv6	 24(%rsp)
+#define Nmod6	 32(%rsp)
+#define N	 40(%rsp)
+#define ALPHA	 48(%rsp)
+#define OFFSET	 56(%rsp)
+#define KK	 64(%rsp)
+#define KKK	 72(%rsp)
+#define BUFFER1	           128(%rsp)
+#define BUFFER2	LB2_OFFSET+128(%rsp)
+
+#if defined(OS_WINDOWS)
+#if   L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+        movl    $0,  4096 * 4(%rsp);\
+        movl    $0,  4096 * 3(%rsp);\
+        movl    $0,  4096 * 2(%rsp);\
+        movl    $0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+        movl    $0,  4096 * 3(%rsp);\
+        movl    $0,  4096 * 2(%rsp);\
+        movl    $0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+        movl    $0,  4096 * 2(%rsp);\
+        movl    $0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+        movl    $0,  4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+
+#if defined(BULLDOZER)
+
+.macro VFMADD231PD_ y0,y1,y2
+	vfmaddpd \y0,\y1,\y2,\y0
+.endm
+
+.macro VFMADD231SD_ x0,x1,x2
+	vfmaddsd \x0,\x1,\x2,\x0
+.endm
+
+#else
+
+.macro VFMADD231PD_ y0,y1,y2
+	vfmadd231pd \y2,\y1,\y0
+.endm
+
+.macro VFMADD231SD_ x0,x1,x2
+	vfmadd231sd \x2,\x1,\x0
+.endm
+
+#endif
+
+
+#define	A_PR1	1024
+#define	B_PR1	256
+
+/*******************************************************************************************
+* 3 lines of N
+*******************************************************************************************/
+
+.macro KERNEL16x3_SUBN
+	vbroadcastsd	-12 * SIZE(BO), %zmm1
+	vbroadcastsd	-11 * SIZE(BO), %zmm2
+	vbroadcastsd	-10 * SIZE(BO), %zmm3
+
+	vmovaps 	-16 * SIZE(AO), %zmm0
+	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
+	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
+
+	vmovaps 	 -8 * SIZE(AO), %zmm9
+	VFMADD231PD_  	%zmm10,%zmm1,%zmm9
+	VFMADD231PD_  	%zmm11,%zmm2,%zmm9
+	VFMADD231PD_  	%zmm12,%zmm3,%zmm9
+	addq	$ 3*SIZE , BO	
+	addq	$ 16*SIZE, AO
+.endm
+
+
+.macro KERNEL8x3_SUBN
+	vbroadcastsd	-12 * SIZE(BO), %ymm1
+	vmovaps 	-16 * SIZE(AO), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	-11 * SIZE(BO), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	-10 * SIZE(BO), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	vmovaps 	-12 * SIZE(AO), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
+	prefetcht0	B_PR1(BO)
+	addq	$ 3*SIZE , BO	
+	addq	$ 8*SIZE, AO
+.endm
+
+.macro KERNEL4x3_SUBN
+	vbroadcastsd	-12 * SIZE(BO), %ymm1
+	vmovaps 	-16 * SIZE(AO), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	-11 * SIZE(BO), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	-10 * SIZE(BO), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	addq	$ 3*SIZE , BO	
+	addq	$ 4*SIZE, AO
+.endm
+
+.macro KERNEL2x3_SUBN
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd 	-16 * SIZE(AO), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	-11 * SIZE(BO), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	-10 * SIZE(BO), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	vmovsd 	-15 * SIZE(AO), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
+	addq	$ 3*SIZE , BO	
+	addq	$ 2*SIZE, AO
+.endm
+
+.macro KERNEL1x3_SUBN
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd 	-16 * SIZE(AO), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	-11 * SIZE(BO), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	-10 * SIZE(BO), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	addq	$ 3*SIZE , BO	
+	addq	$ 1*SIZE, AO
+.endm
+
+
+
+
+
+
+/******************************************************************************************/
+
+.macro KERNEL16x3_1
+	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %zmm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
+	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %zmm2
+	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
+	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %zmm1
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %zmm2
+.endm
+
+
+
+
+.macro KERNEL16x3_2
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
+	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
+	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %zmm1
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %zmm2
+.endm
+
+.macro KERNEL16x3_3
+	vmovups 	  0 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
+	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
+	vmovups 	  8 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
+	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
+	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %zmm1
+	vbroadcastsd	  4 * SIZE(BO, BI, SIZE), %zmm2
+.endm
+
+.macro KERNEL16x3_4
+	vmovups 	 16 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
+	vbroadcastsd	  5 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
+	vmovups 	 24 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
+	addq	$12, BI	
+	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
+	addq	$64, %rax 
+.endm
+
+.macro KERNEL16x3_SUB
+	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %zmm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
+	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %zmm2
+	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %zmm0
+	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
+	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
+	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
+	addq	$3 , BI	
+	addq	$16, %rax 
+.endm
+
+.macro SAVE16x3
+
+	vbroadcastsd	ALPHA, %zmm0
+
+	vmulpd	%zmm0 , %zmm4 , %zmm4
+	vmulpd	%zmm0 , %zmm10, %zmm10
+
+	vmulpd	%zmm0 , %zmm5 , %zmm5
+	vmulpd	%zmm0 , %zmm11, %zmm11
+
+	vmulpd	%zmm0 , %zmm6 , %zmm6
+	vmulpd	%zmm0 , %zmm12, %zmm12
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	        (CO1), %zmm4,%zmm4
+	vaddpd  8 * SIZE(CO1), %zmm10,%zmm10
+
+	vaddpd 	        (CO1, LDC), %zmm5,%zmm5
+	vaddpd  8 * SIZE(CO1, LDC), %zmm11,%zmm11
+
+	vaddpd 	        (CO1, LDC, 2), %zmm6,%zmm6
+	vaddpd  8 * SIZE(CO1, LDC, 2), %zmm12,%zmm12
+
+#endif
+
+	vmovups	%zmm4 ,  	(CO1)
+	vmovups	%zmm10, 8 * SIZE(CO1)
+
+	vmovups	%zmm5 ,  	(CO1, LDC)
+	vmovups	%zmm11, 8 * SIZE(CO1, LDC)
+
+	vmovups	%zmm6 ,  	(CO1, LDC, 2)
+	vmovups	%zmm12, 8 * SIZE(CO1, LDC, 2)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL8x3_1
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
+.endm
+
+.macro KERNEL8x3_2
+	prefetcht0	64+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
+.endm
+
+.macro KERNEL8x3_3
+	prefetcht0	128+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
+.endm
+
+.macro KERNEL8x3_4
+	prefetcht0	192+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  4 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	  5 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
+	addq	$12, BI
+	addq	$32, %rax
+.endm
+
+.macro KERNEL8x3_SUB
+	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
+	addq	$3 , BI
+	addq	$8 , %rax
+.endm
+
+.macro SAVE8x3
+
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm7 , %ymm7
+
+	vmulpd	%ymm0 , %ymm5 , %ymm5
+	vmulpd	%ymm0 , %ymm8 , %ymm8
+
+	vmulpd	%ymm0 , %ymm6 , %ymm6
+	vmulpd	%ymm0 , %ymm9 , %ymm9
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	        (CO1), %ymm4,%ymm4
+	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
+
+	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
+	vaddpd  4 * SIZE(CO1, LDC), %ymm8,%ymm8
+
+	vaddpd 	        (CO1, LDC, 2), %ymm6,%ymm6
+	vaddpd  4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm7 , 4 * SIZE(CO1)
+
+	vmovups	%ymm5 ,  	(CO1, LDC)
+	vmovups	%ymm8 , 4 * SIZE(CO1, LDC)
+
+	vmovups	%ymm6 ,  	(CO1, LDC, 2)
+	vmovups	%ymm9 , 4 * SIZE(CO1, LDC, 2)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL4x3_1
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+.endm
+
+.macro KERNEL4x3_2
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+.endm
+
+.macro KERNEL4x3_3
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+.endm
+
+.macro KERNEL4x3_4
+	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  4 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	  5 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	addq	$12, BI
+	addq	$16, %rax
+.endm
+
+.macro KERNEL4x3_SUB
+	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
+	addq	$3 , BI
+	addq	$4 , %rax
+.endm
+
+.macro SAVE4x3
+
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm5 , %ymm5
+	vmulpd	%ymm0 , %ymm6 , %ymm6
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	        (CO1), %ymm4,%ymm4
+	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
+	vaddpd 	        (CO1, LDC, 2), %ymm6,%ymm6
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm5 ,  	(CO1, LDC)
+	vmovups	%ymm6 ,  	(CO1, LDC, 2)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL2x3_1
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	vmovsd	 -6 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -5 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
+.endm
+
+.macro KERNEL2x3_2
+	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
+.endm
+
+.macro KERNEL2x3_3
+	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-28 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	  2 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	vmovsd 	-27 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
+.endm
+
+.macro KERNEL2x3_4
+	vmovsd	  3 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-26 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	  4 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	  5 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	vmovsd 	-25 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
+	addq	$12, BI
+	addq	$8, %rax
+.endm
+
+.macro KERNEL2x3_SUB
+	vmovsd	 -6 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -5 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
+	addq	$3 , BI
+	addq	$2 , %rax
+.endm
+
+.macro SAVE2x3
+
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm8 , %xmm8
+	vmulsd	%xmm0 , %xmm5 , %xmm5
+	vmulsd	%xmm0 , %xmm10, %xmm10
+	vmulsd	%xmm0 , %xmm6 , %xmm6
+	vmulsd	%xmm0 , %xmm12, %xmm12
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	 (CO1), %xmm4,%xmm4
+	vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
+	vaddsd 	 (CO1, LDC), %xmm5,%xmm5
+	vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10
+	vaddsd 	 (CO1, LDC, 2), %xmm6,%xmm6
+	vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm8 , 1 * SIZE(CO1)
+	vmovsd	%xmm5 ,  	(CO1, LDC)
+	vmovsd	%xmm10, 1 * SIZE(CO1, LDC)
+	vmovsd	%xmm6 ,  	(CO1, LDC, 2)
+	vmovsd	%xmm12, 1 * SIZE(CO1, LDC, 2)
+
+.endm
+
+/*******************************************************************************************/
+
+.macro KERNEL1x3_1
+	vmovsd	 -6 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -5 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+.endm
+
+.macro KERNEL1x3_2
+	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+.endm
+
+.macro KERNEL1x3_3
+	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	  2 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+.endm
+
+.macro KERNEL1x3_4
+	vmovsd	  3 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	  4 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	  5 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	addq	$12, BI
+	addq	$4, %rax
+.endm
+
+.macro KERNEL1x3_SUB
+	vmovsd	 -6 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -5 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
+	addq	$3 , BI
+	addq	$1 , %rax
+.endm
+
+.macro SAVE1x3
+
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm5 , %xmm5
+	vmulsd	%xmm0 , %xmm6 , %xmm6
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	 (CO1), %xmm4,%xmm4
+	vaddsd 	 (CO1, LDC), %xmm5,%xmm5
+	vaddsd 	 (CO1, LDC, 2), %xmm6,%xmm6
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm5 ,  	(CO1, LDC)
+	vmovsd	%xmm6 ,  	(CO1, LDC, 2)
+
+.endm
+
+
+/*******************************************************************************************/
+
+/*******************************************************************************************
+* 2 lines of N
+*******************************************************************************************/
+
+.macro KERNEL16x2_1
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	prefetcht0	64+A_PR1(AO, %rax, SIZE)
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
+.endm
+
+.macro KERNEL16x2_2	// unrolled k-step 2 of 4: same accumulators as step 1, offsets moved by 2 (B) and 16 (A); no index advance
+	prefetcht0	128+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	prefetcht0	192+A_PR1(AO, %rax, SIZE)
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
+	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
+.endm
+
+.macro KERNEL16x2_3	// unrolled k-step 3 of 4: B at offsets 0/1, A at offsets 0..15; no index advance
+	prefetcht0	256+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	  0 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	  4 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	prefetcht0	320+A_PR1(AO, %rax, SIZE)
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	vmovups 	  8 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
+	vmovups 	 12 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
+.endm
+
+.macro KERNEL16x2_4	// unrolled k-step 4 of 4: B at offsets 2/3, A at offsets 16..31; then advances both indices
+	prefetcht0	384+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	 16 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	 20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	prefetcht0	448+A_PR1(AO, %rax, SIZE)
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	vmovups 	 24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
+	vmovups 	 28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
+	addq	$8, BI	// 4 k-steps * 2 B values
+	addq	$64, %rax	// 4 k-steps * 16 A values
+.endm
+
+.macro KERNEL16x2_SUB	// one complete k-step (16 A doubles x 2 B doubles) including the index advance
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
+	addq	$2, BI	// 1 k-step * 2 B values
+	addq	$16, %rax	// 1 k-step * 16 A values
+.endm
+
+.macro SAVE16x2
+// Scale the 16x2 accumulators by ALPHA and store: ymm4/7/10/13 go to CO1, ymm5/8/11/14 go to CO1+LDC.
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm7 , %ymm7
+	vmulpd	%ymm0 , %ymm10, %ymm10
+	vmulpd	%ymm0 , %ymm13, %ymm13
+
+	vmulpd	%ymm0 , %ymm5 , %ymm5
+	vmulpd	%ymm0 , %ymm8 , %ymm8
+	vmulpd	%ymm0 , %ymm11, %ymm11
+	vmulpd	%ymm0 , %ymm14, %ymm14
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddpd 	        (CO1), %ymm4,%ymm4
+	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
+	vaddpd  8 * SIZE(CO1), %ymm10,%ymm10
+	vaddpd 12 * SIZE(CO1), %ymm13,%ymm13
+
+	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
+	vaddpd  4 * SIZE(CO1, LDC), %ymm8,%ymm8
+	vaddpd  8 * SIZE(CO1, LDC), %ymm11,%ymm11
+	vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm7 , 4 * SIZE(CO1)
+	vmovups	%ymm10, 8 * SIZE(CO1)
+	vmovups	%ymm13,12 * SIZE(CO1)
+
+	vmovups	%ymm5 ,  	(CO1, LDC)
+	vmovups	%ymm8 , 4 * SIZE(CO1, LDC)
+	vmovups	%ymm11, 8 * SIZE(CO1, LDC)
+	vmovups	%ymm14,12 * SIZE(CO1, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL8x2_1	// unrolled k-step 1 of 4: 8 A doubles x 2 B doubles into ymm4/5/7/8; no index advance
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0	// VFMADD231PD_ is defined outside this chunk; presumably ymm4 += ymm1 * ymm0
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+.endm
+
+.macro KERNEL8x2_2	// unrolled k-step 2 of 4: B at offsets -2/-1, A at offsets -24..-17; no index advance
+	prefetcht0	64+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+.endm
+
+.macro KERNEL8x2_3	// unrolled k-step 3 of 4: B at offsets 0/1, A at offsets -16..-9; no index advance
+	prefetcht0	128+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+.endm
+
+.macro KERNEL8x2_4	// unrolled k-step 4 of 4: B at offsets 2/3, A at offsets -8..-1; then advances both indices
+	prefetcht0	192+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	addq	$8, BI				 // 4 k-steps * 2 B values
+	addq	$32, %rax 			 // 4 k-steps * 8 A values
+.endm
+
+.macro KERNEL8x2_SUB	// one complete k-step (8 A doubles x 2 B doubles) including the index advance
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
+	addq	$2, BI				 // 1 k-step * 2 B values
+	addq	$8 , %rax 			 // 1 k-step * 8 A values
+.endm
+
+.macro SAVE8x2
+// Scale the 8x2 accumulators by ALPHA and store: ymm4/7 go to CO1, ymm5/8 go to CO1+LDC.
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm7 , %ymm7
+
+	vmulpd	%ymm0 , %ymm5 , %ymm5
+	vmulpd	%ymm0 , %ymm8 , %ymm8
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddpd 	        (CO1), %ymm4,%ymm4
+	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
+
+	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
+	vaddpd  4 * SIZE(CO1, LDC), %ymm8,%ymm8
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm7 , 4 * SIZE(CO1)
+
+	vmovups	%ymm5 ,  	(CO1, LDC)
+	vmovups	%ymm8 , 4 * SIZE(CO1, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL4x2_1	// unrolled k-step 1 of 4: 4 A doubles x 2 B doubles into ymm4/ymm5; no index advance
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0	// VFMADD231PD_ is defined outside this chunk; presumably ymm4 += ymm1 * ymm0
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+.endm
+
+.macro KERNEL4x2_2	// unrolled k-step 2 of 4: B at offsets -2/-1, A at offsets -28..-25; no index advance
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+.endm
+
+.macro KERNEL4x2_3	// unrolled k-step 3 of 4: B at offsets 0/1, A at offsets -24..-21; no index advance
+	prefetcht0	64+A_PR1(AO, %rax, SIZE)
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+.endm
+
+.macro KERNEL4x2_4	// unrolled k-step 4 of 4: B at offsets 2/3, A at offsets -20..-17; then advances both indices
+	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	addq	$8, BI				 // 4 k-steps * 2 B values
+	addq	$16, %rax 			 // 4 k-steps * 4 A values
+.endm
+
+.macro KERNEL4x2_SUB	// one complete k-step (4 A doubles x 2 B doubles) including the index advance
+	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
+	addq	$2, BI				 // 1 k-step * 2 B values
+	addq	$4 , %rax 			 // 1 k-step * 4 A values
+.endm
+
+.macro SAVE4x2
+// Scale the 4x2 accumulators by ALPHA and store: ymm4 goes to CO1, ymm5 goes to CO1+LDC.
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm5 , %ymm5
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddpd 	        (CO1), %ymm4,%ymm4
+	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm5 ,  	(CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL2x2_1	// unrolled k-step 1 of 4, scalar: 2 A doubles x 2 B doubles into xmm4/5/8/10; no index advance
+	prefetcht0	A_PR1(AO, %rax, SIZE)
+	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0	// VFMADD231SD_ is defined outside this chunk; presumably xmm4 += xmm1 * xmm0
+	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+.endm
+
+.macro KERNEL2x2_2	// unrolled k-step 2 of 4, scalar: B at offsets -2/-1, A at offsets -30/-29; no index advance
+	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+.endm
+
+.macro KERNEL2x2_3	// unrolled k-step 3 of 4, scalar: B at offsets 0/1, A at offsets -28/-27; no index advance
+	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-28 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd 	-27 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+.endm
+
+.macro KERNEL2x2_4	// unrolled k-step 4 of 4, scalar: B at offsets 2/3, A at offsets -26/-25; then advances both indices
+	vmovsd	  2 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-26 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	  3 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd 	-25 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+	addq	$8, BI				 // 4 k-steps * 2 B values
+	addq	$8, %rax 			 // 4 k-steps * 2 A values
+.endm
+
+.macro KERNEL2x2_SUB	// one complete scalar k-step (2 A doubles x 2 B doubles) including the index advance
+	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
+	addq	$2, BI				 // 1 k-step * 2 B values
+	addq	$2, %rax 			 // 1 k-step * 2 A values
+.endm
+
+.macro SAVE2x2
+// Scale the 2x2 scalar accumulators by ALPHA and store: xmm4/xmm8 go to CO1, xmm5/xmm10 go to CO1+LDC.
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm8 , %xmm8
+	vmulsd	%xmm0 , %xmm5 , %xmm5
+	vmulsd	%xmm0 , %xmm10, %xmm10
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddsd 	 (CO1), %xmm4,%xmm4
+	vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
+	vaddsd 	 (CO1, LDC), %xmm5,%xmm5
+	vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm8 , 1 * SIZE(CO1)
+	vmovsd	%xmm5 ,  	(CO1, LDC)
+	vmovsd	%xmm10, 1 * SIZE(CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL1x2_1	// unrolled k-step 1 of 4, scalar: 1 A double x 2 B doubles into xmm4/xmm5; no index advance
+	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0	// VFMADD231SD_ is defined outside this chunk; presumably xmm4 += xmm1 * xmm0
+	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+.endm
+
+.macro KERNEL1x2_2	// unrolled k-step 2 of 4, scalar: B at offsets -2/-1, A at offset -31; no index advance
+	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+.endm
+
+.macro KERNEL1x2_3	// unrolled k-step 3 of 4, scalar: B at offsets 0/1, A at offset -30; no index advance
+	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+.endm
+
+.macro KERNEL1x2_4	// unrolled k-step 4 of 4, scalar: B at offsets 2/3, A at offset -29; then advances both indices
+	vmovsd	  2 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	  3 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	addq	$8, BI				 // 4 k-steps * 2 B values
+	addq	$4, %rax 			 // 4 k-steps * 1 A value
+.endm
+
+.macro KERNEL1x2_SUB	// one complete scalar k-step (1 A double x 2 B doubles) including the index advance
+	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
+	addq	$2, BI				 // 1 k-step * 2 B values
+	addq	$1, %rax 			 // 1 k-step * 1 A value
+.endm
+
+.macro SAVE1x2
+// Scale the two scalar accumulators by ALPHA and store: xmm4 goes to CO1, xmm5 goes to CO1+LDC.
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm5 , %xmm5
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddsd 	 (CO1), %xmm4,%xmm4
+	vaddsd 	 (CO1, LDC), %xmm5,%xmm5
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm5 ,  	(CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+/*******************************************************************************************
+* 1 line of N
+*******************************************************************************************/
+
+.macro KERNEL16x1_1	// unrolled k-step 1 of 4: 16 A doubles x 1 B double into ymm4/7/10/13; no index advance
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0	// VFMADD231PD_ is defined outside this chunk; presumably ymm4 += ymm1 * ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+.endm
+
+.macro KERNEL16x1_2	// unrolled k-step 2 of 4: B at offset -1, A at offsets -16..-1; no index advance
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+.endm
+
+.macro KERNEL16x1_3	// unrolled k-step 3 of 4: B at offset 0, A at offsets 0..15; no index advance
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	  0 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vmovups 	  4 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	vmovups 	  8 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	vmovups 	 12 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+.endm
+
+.macro KERNEL16x1_4	// unrolled k-step 4 of 4: B at offset 1, A at offsets 16..31; then advances both indices
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	 16 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vmovups 	 20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	vmovups 	 24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	vmovups 	 28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+	addq	$4, BI				 // 4 k-steps * 1 B value
+	addq	$64, %rax 			 // 4 k-steps * 16 A values
+.endm
+
+.macro KERNEL16x1_SUB	// one complete k-step (16 A doubles x 1 B double) including the index advance
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
+	addq	$1, BI				 // 1 k-step * 1 B value
+	addq	$16, %rax 			 // 1 k-step * 16 A values
+.endm
+
+.macro SAVE16x1
+// Scale the 16x1 accumulators ymm4/7/10/13 by ALPHA and store 16 consecutive doubles at CO1.
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm7 , %ymm7
+	vmulpd	%ymm0 , %ymm10, %ymm10
+	vmulpd	%ymm0 , %ymm13, %ymm13
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddpd 	        (CO1), %ymm4,%ymm4
+	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
+	vaddpd  8 * SIZE(CO1), %ymm10,%ymm10
+	vaddpd 12 * SIZE(CO1), %ymm13,%ymm13
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm7 , 4 * SIZE(CO1)
+	vmovups	%ymm10, 8 * SIZE(CO1)
+	vmovups	%ymm13,12 * SIZE(CO1)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL8x1_1	// unrolled k-step 1 of 4: 8 A doubles x 1 B double into ymm4/ymm7; no index advance
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0	// VFMADD231PD_ is defined outside this chunk; presumably ymm4 += ymm1 * ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+.endm
+
+.macro KERNEL8x1_2	// unrolled k-step 2 of 4: B at offset -1, A at offsets -24..-17; no index advance
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+.endm
+
+.macro KERNEL8x1_3	// unrolled k-step 3 of 4: B at offset 0, A at offsets -16..-9; no index advance
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+.endm
+
+.macro KERNEL8x1_4	// unrolled k-step 4 of 4: B at offset 1, A at offsets -8..-1; then advances both indices
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	addq	$4, BI				 // 4 k-steps * 1 B value
+	addq	$32, %rax 			 // 4 k-steps * 8 A values
+.endm
+
+.macro KERNEL8x1_SUB	// one complete k-step (8 A doubles x 1 B double) including the index advance
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
+	addq	$1, BI				 // 1 k-step * 1 B value
+	addq	$8 , %rax 			 // 1 k-step * 8 A values
+.endm
+
+.macro SAVE8x1
+// Scale the 8x1 accumulators ymm4/ymm7 by ALPHA and store 8 consecutive doubles at CO1.
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm7 , %ymm7
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddpd 	        (CO1), %ymm4,%ymm4
+	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm7 , 4 * SIZE(CO1)
+
+.endm
+
+
+
+/*******************************************************************************************/
+
+.macro KERNEL4x1_1	// unrolled k-step 1 of 4: 4 A doubles x 1 B double into ymm4; no index advance
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0	// VFMADD231PD_ is defined outside this chunk; presumably ymm4 += ymm1 * ymm0
+.endm
+
+.macro KERNEL4x1_2	// unrolled k-step 2 of 4: B at offset -1, A at offsets -28..-25; no index advance
+	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+.endm
+
+.macro KERNEL4x1_3	// unrolled k-step 3 of 4: B at offset 0, A at offsets -24..-21; no index advance
+	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+.endm
+
+.macro KERNEL4x1_4	// unrolled k-step 4 of 4: B at offset 1, A at offsets -20..-17; then advances both indices
+	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	addq	$4, BI				 // 4 k-steps * 1 B value
+	addq	$16, %rax 			 // 4 k-steps * 4 A values
+.endm
+
+.macro KERNEL4x1_SUB	// one complete k-step (4 A doubles x 1 B double) including the index advance
+	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
+	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
+	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
+	addq	$1, BI				 // 1 k-step * 1 B value
+	addq	$4 , %rax 			 // 1 k-step * 4 A values
+.endm
+
+.macro SAVE4x1
+// Scale the 4x1 accumulator ymm4 by ALPHA and store 4 consecutive doubles at CO1.
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddpd 	        (CO1), %ymm4,%ymm4
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL2x1_1	// unrolled k-step 1 of 4, scalar: 2 A doubles x 1 B double into xmm4/xmm8; no index advance
+	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0	// VFMADD231SD_ is defined outside this chunk; presumably xmm4 += xmm1 * xmm0
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+.endm
+
+.macro KERNEL2x1_2	// unrolled k-step 2 of 4, scalar: B at offset -1, A at offsets -30/-29; no index advance
+	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+.endm
+
+.macro KERNEL2x1_3	// unrolled k-step 3 of 4, scalar: B at offset 0, A at offsets -28/-27; no index advance
+	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-28 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd 	-27 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+.endm
+
+.macro KERNEL2x1_4	// unrolled k-step 4 of 4, scalar: B at offset 1, A at offsets -26/-25; then advances both indices
+	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-26 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd 	-25 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	addq	$4, BI				 // 4 k-steps * 1 B value
+	addq	$8, %rax 			 // 4 k-steps * 2 A values
+.endm
+
+.macro KERNEL2x1_SUB	// one complete scalar k-step (2 A doubles x 1 B double) including the index advance
+	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
+	addq	$1, BI				 // 1 k-step * 1 B value
+	addq	$2 , %rax 			 // 1 k-step * 2 A values
+.endm
+
+.macro SAVE2x1
+// Scale the two scalar accumulators xmm4/xmm8 by ALPHA and store 2 consecutive doubles at CO1.
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm8 , %xmm8
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the values already stored in C
+	vaddsd 	 (CO1), %xmm4,%xmm4
+	vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm8 , 1 * SIZE(CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+
+.macro KERNEL1x1_1	// unrolled k-step 1 of 4, scalar: 1 A double x 1 B double into xmm4; no index advance
+	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0	// VFMADD231SD_ is defined outside this chunk; presumably xmm4 += xmm1 * xmm0
+.endm
+
+.macro KERNEL1x1_2	// unrolled k-step 2 of 4, scalar: B at offset -1, A at offset -31; no index advance
+	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+.endm
+
+.macro KERNEL1x1_3	// unrolled k-step 3 of 4, scalar: B at offset 0, A at offset -30; no index advance
+	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+.endm
+
+.macro KERNEL1x1_4	// unrolled k-step 4 of 4, scalar: B at offset 1, A at offset -29; then advances both indices
+	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	addq	$ 4, BI				 // 4 k-steps * 1 B value
+	addq	$ 4, %rax 			 // 4 k-steps * 1 A value
+.endm
+
+.macro KERNEL1x1_SUB	// one complete scalar k-step (1 A double x 1 B double) including the index advance
+	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
+	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
+	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
+	addq	$ 1, BI				 // 1 k-step * 1 B value
+	addq	$ 1 , %rax 			 // 1 k-step * 1 A value
+.endm
+
+.macro SAVE1x1
+// Scale the single scalar accumulator xmm4 by ALPHA and store one double at CO1.
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+
+#if !defined(TRMMKERNEL)
+	// only when TRMMKERNEL is not defined: add the value already stored in C
+	vaddsd 	 (CO1), %xmm4,%xmm4
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+
+#if !defined(TRMMKERNEL)
+
+
+	PROLOGUE
+	PROFCODE
+	
+	subq	$STACKSIZE, %rsp	// reserve the register save area
+	movq	%rbx,   (%rsp)	// save rbx, rbp and r12..r15 in the save area
+	movq	%rbp,  8(%rsp)
+	movq	%r12, 16(%rsp)
+	movq	%r13, 24(%rsp)
+	movq	%r14, 32(%rsp)
+	movq	%r15, 40(%rsp)
+
+	vzeroupper
+
+#ifdef WINDOWS_ABI
+	movq	%rdi,    48(%rsp)	// WINDOWS_ABI only: also save rdi, rsi and xmm6..xmm15
+	movq	%rsi,    56(%rsp)
+	movups	%xmm6,   64(%rsp)
+	movups	%xmm7,   80(%rsp)
+	movups	%xmm8,   96(%rsp)
+	movups	%xmm9,  112(%rsp)
+	movups	%xmm10, 128(%rsp)
+	movups	%xmm11, 144(%rsp)
+	movups	%xmm12, 160(%rsp)
+	movups	%xmm13, 176(%rsp)
+	movups	%xmm14, 192(%rsp)
+	movups	%xmm15, 208(%rsp)
+
+	movq	ARG1,      OLD_M
+	movq	ARG2,      OLD_N
+	movq	ARG3,      OLD_K
+	movq	OLD_A,     A
+	movq	OLD_B,     B
+	movq	OLD_C,     C
+	movq	OLD_LDC,   LDC
+
+	vmovaps	%xmm3, %xmm0	// xmm0 is what gets stored to ALPHA further down
+
+#else
+	movq	STACKSIZE +  8(%rsp), LDC	// LDC is read from the stack above the save area
+
+#endif
+
+	movq    %rsp, SP      # save old stack
+        subq    $128 + L_BUFFER_SIZE, %rsp
+        andq    $-4096, %rsp    # align stack down to a 4096-byte boundary
+
+        STACK_TOUCH
+	// jump to .L999 (outside this chunk) when M, N or K is zero
+	cmpq	$0, OLD_M
+	je	.L999
+
+	cmpq	$0, OLD_N
+	je	.L999
+
+	cmpq	$0, OLD_K
+	je	.L999
+
+	movq	OLD_M, M
+	movq	OLD_N, N
+	movq	OLD_K, K
+
+	vmovsd	 %xmm0, ALPHA
+
+	salq	$BASE_SHIFT, LDC	// LDC <<= BASE_SHIFT (presumably element count to byte stride; BASE_SHIFT is defined elsewhere)
+
+	movq    N, %rax
+        xorq    %rdx, %rdx
+        movq    $6,  %rdi
+        divq    %rdi                    //    N / 6
+        movq    %rax, Ndiv6             //    N / 6
+        movq    %rdx, Nmod6             //    N % 6
+
+
+	movq	Ndiv6,  J	// J counts the 6-column passes
+	cmpq	$0, J
+	je	.L2_0	// N < 6: skip the 6-column passes
+	ALIGN_4
+
+.L6_01:
+        // copy to sub buffer: per k-step, 2 doubles from BO1 and 1 double from BO2 are written to BUFFER1
+        movq    K, %rax
+        salq    $1,%rax                 // K * 2 ; read 2 values
+        movq    B, BO1
+        leaq    (B,%rax, SIZE), BO2     // next offset to BO2
+        leaq    BUFFER1, BO             // first buffer to BO
+        movq    K, %rax
+	sarq	$3 , %rax		// K / 8
+	jz	.L6_01a_2
+        ALIGN_4
+
+.L6_01a_1:
+	// each iteration handles 8 k-steps as two identical groups of 4
+        prefetcht0 512(BO1)
+        prefetcht0 512(BO2)
+        prefetchw  512(BO)
+
+
+	vmovups	0 * SIZE(BO1), %xmm0
+	vmovups	2 * SIZE(BO1), %xmm2
+	vmovups	4 * SIZE(BO1), %xmm4
+	vmovups	6 * SIZE(BO1), %xmm6
+	vmovsd  0 * SIZE(BO2), %xmm1
+	vmovsd  2 * SIZE(BO2), %xmm3
+	vmovsd  4 * SIZE(BO2), %xmm5
+	vmovsd  6 * SIZE(BO2), %xmm7
+	vmovups	%xmm0, 0*SIZE(BO)
+	vmovsd	%xmm1, 2*SIZE(BO)
+	vmovups	%xmm2, 3*SIZE(BO)
+	vmovsd	%xmm3, 5*SIZE(BO)
+	vmovups	%xmm4, 6*SIZE(BO)
+	vmovsd	%xmm5, 8*SIZE(BO)
+	vmovups	%xmm6, 9*SIZE(BO)
+	vmovsd	%xmm7,11*SIZE(BO)
+	addq	$ 8*SIZE,BO1
+	addq	$ 8*SIZE,BO2
+	addq	$ 12*SIZE,BO
+	// second group of 4 k-steps, same pattern
+	vmovups	0 * SIZE(BO1), %xmm0
+	vmovups	2 * SIZE(BO1), %xmm2
+	vmovups	4 * SIZE(BO1), %xmm4
+	vmovups	6 * SIZE(BO1), %xmm6
+	vmovsd  0 * SIZE(BO2), %xmm1
+	vmovsd  2 * SIZE(BO2), %xmm3
+	vmovsd  4 * SIZE(BO2), %xmm5
+	vmovsd  6 * SIZE(BO2), %xmm7
+	vmovups	%xmm0, 0*SIZE(BO)
+	vmovsd	%xmm1, 2*SIZE(BO)
+	vmovups	%xmm2, 3*SIZE(BO)
+	vmovsd	%xmm3, 5*SIZE(BO)
+	vmovups	%xmm4, 6*SIZE(BO)
+	vmovsd	%xmm5, 8*SIZE(BO)
+	vmovups	%xmm6, 9*SIZE(BO)
+	vmovsd	%xmm7,11*SIZE(BO)
+	addq	$ 8*SIZE,BO1
+	addq	$ 8*SIZE,BO2
+	addq	$ 12*SIZE,BO
+
+	decq	%rax
+	jnz	.L6_01a_1
+
+
+
+.L6_01a_2:
+
+	movq    K, %rax
+        andq    $7, %rax                // K % 8
+        jz      .L6_02c
+        ALIGN_4
+
+
+.L6_02b:
+	// one leftover k-step: 2 doubles from BO1, 1 double from BO2
+	vmovups	0 * SIZE(BO1), %xmm0
+	vmovsd  0 * SIZE(BO2), %xmm2
+	vmovups	%xmm0, 0*SIZE(BO)
+	vmovsd	%xmm2, 2*SIZE(BO)
+	addq	$ 2*SIZE,BO1
+	addq	$ 2*SIZE,BO2
+	addq	$ 3*SIZE,BO
+	decq	%rax
+	jnz	.L6_02b
+
+.L6_02c:
+	// fill BUFFER2: per k-step, the second double of the BO1 pair followed by 2 doubles from BO2
+	movq	K, %rax
+	salq	$1,%rax			// K * 2
+	leaq	(B,%rax, SIZE), BO1	// next offset to BO1
+	leaq	(BO1,%rax, SIZE), BO2	// next offset to BO2
+	leaq    BUFFER2, BO		// second buffer to BO
+	movq	K, %rax
+	sarq	$3 , %rax		// K / 8
+	jz	.L6_02c_2
+	ALIGN_4
+
+.L6_02c_1:
+	// each iteration handles 8 k-steps as two identical groups of 4
+	prefetcht0 512(BO2)
+        prefetchw  512(BO)
+
+	vmovups	0 * SIZE(BO2), %xmm0
+	vmovups	2 * SIZE(BO2), %xmm2
+	vmovups	4 * SIZE(BO2), %xmm4
+	vmovups	6 * SIZE(BO2), %xmm6
+	vmovsd  1 * SIZE(BO1), %xmm1
+	vmovsd  3 * SIZE(BO1), %xmm3
+	vmovsd  5 * SIZE(BO1), %xmm5
+	vmovsd  7 * SIZE(BO1), %xmm7
+	vmovsd	%xmm1, 0*SIZE(BO)
+	vmovups	%xmm0, 1*SIZE(BO)
+	vmovsd	%xmm3, 3*SIZE(BO)
+	vmovups	%xmm2, 4*SIZE(BO)
+	vmovsd	%xmm5, 6*SIZE(BO)
+	vmovups	%xmm4, 7*SIZE(BO)
+	vmovsd	%xmm7, 9*SIZE(BO)
+	vmovups	%xmm6,10*SIZE(BO)
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO2
+	addq	$12*SIZE,BO
+
+	// second group of 4 k-steps, same pattern
+	vmovups	0 * SIZE(BO2), %xmm0
+	vmovups	2 * SIZE(BO2), %xmm2
+	vmovups	4 * SIZE(BO2), %xmm4
+	vmovups	6 * SIZE(BO2), %xmm6
+	vmovsd  1 * SIZE(BO1), %xmm1
+	vmovsd  3 * SIZE(BO1), %xmm3
+	vmovsd  5 * SIZE(BO1), %xmm5
+	vmovsd  7 * SIZE(BO1), %xmm7
+	vmovsd	%xmm1, 0*SIZE(BO)
+	vmovups	%xmm0, 1*SIZE(BO)
+	vmovsd	%xmm3, 3*SIZE(BO)
+	vmovups	%xmm2, 4*SIZE(BO)
+	vmovsd	%xmm5, 6*SIZE(BO)
+	vmovups	%xmm4, 7*SIZE(BO)
+	vmovsd	%xmm7, 9*SIZE(BO)
+	vmovups	%xmm6,10*SIZE(BO)
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO2
+	addq	$12*SIZE,BO
+
+	decq	%rax
+	jnz	.L6_02c_1
+
+
+.L6_02c_2:
+
+	movq    K, %rax
+        andq    $7, %rax                // K % 8
+        jz      .L6_03c
+        ALIGN_4
+
+.L6_03b:
+	// one leftover k-step: 1 double from BO1 (offset 1), 2 doubles from BO2
+	vmovsd	  1*SIZE(BO1), %xmm0
+	vmovups	  0*SIZE(BO2), %xmm1
+	vmovsd	%xmm0, 0*SIZE(BO)
+	vmovups	%xmm1, 1*SIZE(BO)
+	addq	$2*SIZE,BO1
+	addq	$2*SIZE,BO2
+	addq	$3*SIZE,BO
+	decq	%rax
+	jnz	.L6_03b
+
+
+.L6_03c:
+
+	movq	BO2, B			// next offset of B
+
+.L6_10:
+	movq	C, CO1
+	leaq	(C, LDC, 2), C		 
+	leaq	(C, LDC, 1), C		// c += 3 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L6_20
+
+	ALIGN_4
+
+.L6_11:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	prefetcht0	(CO1)
+	prefetcht0	(CO1,LDC,1)
+	prefetcht0	(CO1,LDC,2)
+	prefetcht0	64(CO1)
+	prefetcht0	64(CO1,LDC,1)
+	prefetcht0	64(CO1,LDC,2)
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq $1, %rax			//  K / 2 (the loop body below runs 2 k-steps per iteration)
+	je	.L6_16
+
+	ALIGN_5
+
+.L6_12:
+/*
+	prefetcht0	B_PR1(BO)
+	prefetcht0	B_PR1+64(BO)
+	prefetcht0	B_PR1+128(BO)
+*/
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+/*
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+*/
+	dec	%rax
+	jne	.L6_12
+
+.L6_16:
+        movq    K, %rax
+
+	andq	$1, %rax		# if (k & 1)
+	je .L6_19
+
+	ALIGN_4
+
+.L6_17:
+
+	KERNEL16x3_SUBN
+
+	dec	%rax
+	jne	.L6_17
+	ALIGN_4
+
+
+.L6_19:
+
+	SAVE16x3
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L6_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L6_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L7_10		// to next 3 lines of N
+
+	testq	$8, M		
+	jz	.L6_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L6_20_1:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 (8 k-steps per iteration)
+	je	.L6_20_6
+
+	ALIGN_4
+
+.L6_20_2:
+
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	dec	%rax
+	jne	.L6_20_2
+	ALIGN_4
+
+.L6_20_6:
+        movq    K, %rax
+
+	andq	$7, %rax		# K % 8 leftover k-steps
+	je .L6_20_9
+
+
+	ALIGN_4
+
+.L6_20_7:
+
+	KERNEL8x3_SUBN
+
+	dec	%rax
+	jne	.L6_20_7
+	ALIGN_4
+
+
+.L6_20_9:
+
+	SAVE8x3
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L6_21pre:
+
+	testq	$4, M		
+	jz	.L6_30
+	ALIGN_4
+
+.L6_21:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 (8 k-steps per iteration)
+	je	.L6_26
+
+	ALIGN_4
+
+.L6_22:
+
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	dec	%rax
+	jne	.L6_22
+	ALIGN_4
+
+.L6_26:
+        movq    K, %rax
+
+	andq	$7, %rax		# K % 8 leftover k-steps
+	je .L6_29
+
+	ALIGN_4
+
+.L6_27:
+
+	KERNEL4x3_SUBN
+
+	dec %rax
+	jne	.L6_27
+	ALIGN_4
+
+
+.L6_29:
+
+	SAVE4x3
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L6_30:
+	testq	$2, M		
+	jz	.L6_40
+
+	ALIGN_4
+
+.L6_31:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 (8 k-steps per iteration)
+	je	.L6_36
+	ALIGN_4
+
+.L6_32:
+
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	dec %rax
+	jne	.L6_32
+	ALIGN_4
+
+.L6_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# K % 8 leftover k-steps
+	je .L6_39
+
+	ALIGN_4
+
+.L6_37:
+
+	KERNEL2x3_SUBN
+
+	dec %rax
+	jne	.L6_37
+	ALIGN_4
+
+
+.L6_39:
+
+	SAVE2x3
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L6_40:
+	testq	$1, M		
+	jz	.L7_10		// to next 3 lines of N
+
+	ALIGN_4
+
+.L6_41:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq	$3,%rax		// K / 8 (8 k-steps per iteration)
+	je	.L6_46
+
+	ALIGN_4
+
+.L6_42:
+
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+
+	dec %rax
+	jne	.L6_42
+	ALIGN_4
+
+.L6_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# K % 8 leftover k-steps
+	je .L6_49
+
+	ALIGN_4
+
+.L6_47:
+
+	KERNEL1x3_SUBN
+
+	dec	%rax
+	jne	.L6_47
+	ALIGN_4
+
+
+.L6_49:
+
+	SAVE1x3
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+
+
+/***************************************************************************************************************/
+
+.L7_10:
+	movq	C, CO1
+	leaq	(C, LDC, 2), C		 
+	leaq	(C, LDC, 1), C		// c += 3 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L7_20
+
+	ALIGN_4
+
+.L7_11:
+        leaq    BUFFER2, BO             // second buffer to BO
+        addq    $12 * SIZE, BO
+
+	prefetcht0	(CO1)
+	prefetcht0	(CO1,LDC,1)
+	prefetcht0	(CO1,LDC,2)
+	prefetcht0	64(CO1)
+	prefetcht0	64(CO1,LDC,1)
+	prefetcht0	64(CO1,LDC,2)
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq $3, %rax			// K / 8
+	je	.L7_16
+	ALIGN_5
+
+.L7_12:
+/*
+	prefetcht0	B_PR1(BO)
+	prefetcht0	B_PR1+64(BO)
+	prefetcht0	B_PR1+128(BO)
+*/
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+	KERNEL16x3_SUBN
+	dec %rax
+	jne	.L7_12
+	ALIGN_4
+
+.L7_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# K % 8 leftover k-steps
+	je .L7_19
+
+	ALIGN_5
+
+.L7_17:
+
+	KERNEL16x3_SUBN
+
+	dec	%rax
+	jne	.L7_17
+
+
+.L7_19:
+
+	SAVE16x3
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L7_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L7_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L7_60		// M % 16 == 0: go to the J loop tail at .L7_60
+
+	testq	$8, M		
+	jz	.L7_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L7_20_1:
+        leaq    BUFFER2, BO             // second buffer to BO
+        addq    $12 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 (8 k-steps per iteration)
+	je	.L7_20_6
+
+	ALIGN_4
+
+.L7_20_2:
+
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+	KERNEL8x3_SUBN
+
+	dec %rax
+	jne	.L7_20_2
+	ALIGN_4
+
+.L7_20_6:
+        movq    K, %rax
+
+	andq	$7, %rax		# K % 8 leftover k-steps
+	je .L7_20_9
+
+	ALIGN_4
+
+.L7_20_7:
+
+	KERNEL8x3_SUBN
+
+	dec %rax
+	jne	.L7_20_7
+	ALIGN_4
+
+.L7_20_9:
+
+	SAVE8x3
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L7_21pre:
+
+	testq	$4, M		
+	jz	.L7_30
+	ALIGN_4
+
+.L7_21:
+        leaq    BUFFER2, BO             // second buffer to BO
+        addq    $12 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq	$3, %rax
+	je	.L7_26
+
+	ALIGN_4
+
+.L7_22:
+
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+	KERNEL4x3_SUBN
+
+	dec %rax
+	jne	.L7_22
+	ALIGN_4
+
+.L7_26:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the 8x unrolled loop
+	je .L7_29
+
+	ALIGN_4
+
+.L7_27:
+
+	KERNEL4x3_SUBN
+
+	dec %rax
+	jne	.L7_27
+	ALIGN_4
+
+
+.L7_29:
+
+	SAVE4x3
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L7_30:
+	testq	$2, M		
+	jz	.L7_40
+
+	ALIGN_4
+
+.L7_31:
+        leaq    BUFFER2, BO             // second buffer to BO
+        addq    $12 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq	$3, %rax
+	je	.L7_36
+
+	ALIGN_4
+
+.L7_32:
+
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+	KERNEL2x3_SUBN
+
+	dec %rax
+	jne	.L7_32
+	ALIGN_4
+
+.L7_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the 8x unrolled loop
+	je .L7_39
+
+	ALIGN_4
+
+.L7_37:
+
+	KERNEL2x3_SUBN
+
+	dec %rax
+	jne	.L7_37
+	ALIGN_4
+
+
+.L7_39:
+
+	SAVE2x3
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L7_40:
+	testq	$1, M		
+	jz	.L7_60		// to next 3 lines of N
+
+	ALIGN_4
+
+.L7_41:
+        leaq    BUFFER2, BO             // second buffer to BO
+        addq    $12 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	sarq	$3, %rax
+	je	.L7_46
+
+	ALIGN_4
+
+.L7_42:
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+	KERNEL1x3_SUBN
+
+	dec %rax
+	jne	.L7_42
+	ALIGN_4
+
+.L7_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the 8x unrolled loop
+	je .L7_49
+
+	ALIGN_4
+
+.L7_47:
+
+	KERNEL1x3_SUBN
+
+	dec %rax
+	jne	.L7_47
+	ALIGN_4
+
+
+.L7_49:
+
+	SAVE1x3
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+
+.L7_60:
+
+	decq	J			// j --
+	jg	.L6_01
+
+
+.L2_0:
+	cmpq	$0, Nmod6		// N % 6 == 0
+	je	.L999
+
+/************************************************************************************************
+* Loop for Nmod6 / 2 > 0
+*************************************************************************************************/
+
+	movq	Nmod6, J		
+	sarq	$1, J			// j = j / 2
+	je	.L1_0
+	ALIGN_4
+
+.L2_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	sarq	$2, %rax		// K / 4
+	jz	.L2_01b
+	ALIGN_4
+
+.L2_01a:
+        prefetcht0 512(BO1)
+        prefetchw  512(BO)
+
+	vmovups	      (BO1), %xmm0
+	vmovups	2*SIZE(BO1), %xmm1
+	vmovups	4*SIZE(BO1), %xmm2
+	vmovups	6*SIZE(BO1), %xmm3
+
+	vmovups	%xmm0,       (BO)
+	vmovups	%xmm1, 2*SIZE(BO)
+	vmovups	%xmm2, 4*SIZE(BO)
+	vmovups	%xmm3, 6*SIZE(BO)
+
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO
+	decq	%rax
+	jnz	.L2_01a
+
+
+.L2_01b:
+
+        movq    K, %rax
+        andq    $3, %rax                // K % 4
+        jz      .L2_02d
+        ALIGN_4
+
+.L2_02c:
+
+	vmovups	(BO1), %xmm0
+	vmovups	%xmm0, (BO)
+	addq	$2*SIZE,BO1
+	addq	$2*SIZE,BO
+	decq	%rax
+	jnz	.L2_02c
+
+.L2_02d:
+
+	movq	BO1, B			// next offset of B
+
+.L2_10:
+	movq	C, CO1
+	leaq	(C, LDC, 2), C		// c += 2 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$32 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L2_20
+
+	ALIGN_4
+
+.L2_11:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L2_16
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_12:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	je	.L2_16
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	je	.L2_16
+
+	jmp	.L2_12
+	ALIGN_4
+
+.L2_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L2_19
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_17:
+
+	KERNEL16x2_SUB
+
+	jl	.L2_17			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L2_19:
+
+	SAVE16x2
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L2_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L2_20:
+	// Test rest of M
+
+	testq	$15, M			// M % 16 == 0: no partial row tile left
+	jz	.L2_60		// to next 2 lines of N
+
+	testq	$8, M		// bit 3 of M clear: skip ahead to the 4-row test
+	jz	.L2_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L2_20_1:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L2_20_6
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_2:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	je	.L2_20_6
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	je	.L2_20_6
+
+	jmp	.L2_20_2
+	ALIGN_4
+
+.L2_20_6:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L2_20_9
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_20_7:
+
+	KERNEL8x2_SUB
+
+	jl	.L2_20_7		// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L2_20_9:
+
+	SAVE8x2
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+	testq	$4, M		
+	jz	.L2_30
+	ALIGN_4
+
+.L2_21:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax			// rax = k - ( k % 8 )
+	je	.L2_26
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_22:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	je	.L2_26
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	je	.L2_26
+
+	jmp	.L2_22
+	ALIGN_4
+
+.L2_26:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L2_29
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_27:
+
+	KERNEL4x2_SUB
+
+	jl	.L2_27			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L2_29:
+
+	SAVE4x2
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L2_30:
+	testq	$2, M		
+	jz	.L2_40
+
+	ALIGN_4
+
+.L2_31:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L2_36
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_32:
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	je	.L2_36
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	je	.L2_36
+
+	jmp	.L2_32
+	ALIGN_4
+
+.L2_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L2_39
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+	
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_37:
+
+	KERNEL2x2_SUB
+
+	jl	.L2_37			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L2_39:
+
+	SAVE2x2
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L2_40:
+	testq	$1, M		
+	jz	.L2_60		// to next 2 lines of N
+
+	ALIGN_4
+
+.L2_41:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L2_46
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_42:
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_3
+	KERNEL1x2_4
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_3
+	KERNEL1x2_4
+
+	je	.L2_46
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_3
+	KERNEL1x2_4
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_3
+	KERNEL1x2_4
+
+	je	.L2_46
+
+	jmp	.L2_42
+	ALIGN_4
+
+.L2_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L2_49
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_47:
+
+	KERNEL1x2_SUB
+
+	jl	.L2_47			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L2_49:
+
+	SAVE1x2
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+.L2_60:
+
+	decq	J			// j --
+	jg	.L2_01			// next 2 lines of N
+
+
+
+.L1_0:
+
+/************************************************************************************************
+* Loop for Nmod6 % 2 > 0
+*************************************************************************************************/
+
+	movq	Nmod6, J		
+	andq	$1, J			// j % 2
+	je	.L999
+	ALIGN_4
+
+.L1_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	ALIGN_4
+
+.L1_02b:
+
+	vmovsd	(BO1), %xmm0
+	vmovsd	%xmm0,       (BO)
+	addq	$1*SIZE,BO1
+	addq	$1*SIZE,BO
+	decq	%rax
+	jnz	.L1_02b
+
+.L1_02c:
+
+	movq	BO1, B			// next offset of B
+
+.L1_10:
+	movq	C, CO1
+	leaq	(C, LDC, 1), C		// c += 1 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$32 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L1_20
+
+	ALIGN_4
+
+.L1_11:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L1_16
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_12:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x1_1
+	KERNEL16x1_2
+	KERNEL16x1_3
+	KERNEL16x1_4
+
+	KERNEL16x1_1
+	KERNEL16x1_2
+	KERNEL16x1_3
+	KERNEL16x1_4
+
+	je	.L1_16
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x1_1
+	KERNEL16x1_2
+	KERNEL16x1_3
+	KERNEL16x1_4
+
+	KERNEL16x1_1
+	KERNEL16x1_2
+	KERNEL16x1_3
+	KERNEL16x1_4
+
+	je	.L1_16
+
+	jmp	.L1_12
+	ALIGN_4
+
+.L1_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L1_19
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L1_17:
+
+	KERNEL16x1_SUB
+
+	jl	.L1_17			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L1_19:
+
+	SAVE16x1
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L1_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L1_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L999
+
+	testq	$8, M		
+	jz	.L1_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L1_20_1:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L1_20_6
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_20_2:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x1_1
+	KERNEL8x1_2
+	KERNEL8x1_3
+	KERNEL8x1_4
+
+	KERNEL8x1_1
+	KERNEL8x1_2
+	KERNEL8x1_3
+	KERNEL8x1_4
+
+	je	.L1_20_6
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x1_1
+	KERNEL8x1_2
+	KERNEL8x1_3
+	KERNEL8x1_4
+
+	KERNEL8x1_1
+	KERNEL8x1_2
+	KERNEL8x1_3
+	KERNEL8x1_4
+
+	je	.L1_20_6
+
+	jmp	.L1_20_2
+	ALIGN_4
+
+.L1_20_6:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L1_20_9
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L1_20_7:
+
+	KERNEL8x1_SUB
+
+	jl	.L1_20_7		// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L1_20_9:
+
+	SAVE8x1
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L1_21pre:
+
+	testq	$4, M		
+	jz	.L1_30
+	ALIGN_4
+
+.L1_21:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L1_26
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_22:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x1_1
+	KERNEL4x1_2
+	KERNEL4x1_3
+	KERNEL4x1_4
+
+	KERNEL4x1_1
+	KERNEL4x1_2
+	KERNEL4x1_3
+	KERNEL4x1_4
+
+	je	.L1_26
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x1_1
+	KERNEL4x1_2
+	KERNEL4x1_3
+	KERNEL4x1_4
+
+	KERNEL4x1_1
+	KERNEL4x1_2
+	KERNEL4x1_3
+	KERNEL4x1_4
+
+	je	.L1_26
+
+	jmp	.L1_22
+	ALIGN_4
+
+.L1_26:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L1_29
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L1_27:
+
+	KERNEL4x1_SUB
+
+	jl	.L1_27			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L1_29:
+
+	SAVE4x1
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L1_30:
+	testq	$2, M		
+	jz	.L1_40
+
+	ALIGN_4
+
+.L1_31:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L1_36
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_32:
+
+	KERNEL2x1_1
+	KERNEL2x1_2
+	KERNEL2x1_3
+	KERNEL2x1_4
+
+	KERNEL2x1_1
+	KERNEL2x1_2
+	KERNEL2x1_3
+	KERNEL2x1_4
+
+	je	.L1_36
+
+	KERNEL2x1_1
+	KERNEL2x1_2
+	KERNEL2x1_3
+	KERNEL2x1_4
+
+	KERNEL2x1_1
+	KERNEL2x1_2
+	KERNEL2x1_3
+	KERNEL2x1_4
+
+	je	.L1_36
+
+	jmp	.L1_32
+	ALIGN_4
+
+.L1_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L1_39
+
+	movq    %rax, BI                        //  Index for BO
+	
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L1_37:
+
+	KERNEL2x1_SUB
+
+	jl	.L1_37			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L1_39:
+
+	SAVE2x1
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L1_40:
+	testq	$1, M		
+	jz	.L999
+
+	ALIGN_4
+
+.L1_41:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+
+	vzeroall
+
+        movq    K, %rax
+
+	andq	$-8, %rax
+	je	.L1_46
+	movq    %rax, BI                        //  Index for BO
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_42:
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	je	.L1_46
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	je	.L1_46
+
+	jmp	.L1_42
+	ALIGN_4
+
+.L1_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# rax = k % 8 (k & 7): k steps left after the unrolled loop
+	je .L1_49
+
+	movq    %rax, BI                        //  Index for BO
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L1_47:
+
+	KERNEL1x1_SUB
+
+	jl	.L1_47			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L1_49:
+
+	SAVE1x1
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+.L999:
+	movq   		SP, %rsp
+	movq	   (%rsp), %rbx
+	movq	  8(%rsp), %rbp
+	movq	 16(%rsp), %r12
+	movq	 24(%rsp), %r13
+	movq	 32(%rsp), %r14
+	movq	 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	movq	 48(%rsp), %rdi
+	movq	 56(%rsp), %rsi
+	movups	 64(%rsp), %xmm6
+	movups	 80(%rsp), %xmm7
+	movups	 96(%rsp), %xmm8
+	movups	112(%rsp), %xmm9
+	movups	128(%rsp), %xmm10
+	movups	144(%rsp), %xmm11
+	movups	160(%rsp), %xmm12
+	movups	176(%rsp), %xmm13
+	movups	192(%rsp), %xmm14
+	movups	208(%rsp), %xmm15
+#endif
+
+	addq	$STACKSIZE, %rsp
+	ret
+
+	EPILOGUE
+
+
+#else
+/*************************************************************************************
+* TRMM Kernel
+*************************************************************************************/
+
+
+	PROLOGUE
+	PROFCODE
+	
+	subq	$STACKSIZE, %rsp
+	movq	%rbx,   (%rsp)
+	movq	%rbp,  8(%rsp)
+	movq	%r12, 16(%rsp)
+	movq	%r13, 24(%rsp)
+	movq	%r14, 32(%rsp)
+	movq	%r15, 40(%rsp)
+
+	vzeroupper
+
+#ifdef WINDOWS_ABI
+	movq	%rdi,    48(%rsp)
+	movq	%rsi,    56(%rsp)
+	movups	%xmm6,   64(%rsp)
+	movups	%xmm7,   80(%rsp)
+	movups	%xmm8,   96(%rsp)
+	movups	%xmm9,  112(%rsp)
+	movups	%xmm10, 128(%rsp)
+	movups	%xmm11, 144(%rsp)
+	movups	%xmm12, 160(%rsp)
+	movups	%xmm13, 176(%rsp)
+	movups	%xmm14, 192(%rsp)
+	movups	%xmm15, 208(%rsp)
+
+	movq	ARG1,      OLD_M
+	movq	ARG2,      OLD_N
+	movq	ARG3,      OLD_K
+	movq	OLD_A,     A
+	movq	OLD_B,     B
+	movq	OLD_C,     C
+	movq	OLD_LDC,   LDC
+#ifdef TRMMKERNEL
+	movsd	OLD_OFFSET, %xmm12
+#endif
+	vmovaps	%xmm3, %xmm0
+
+#else
+	movq	STACKSIZE +  8(%rsp), LDC
+#ifdef TRMMKERNEL
+	movsd	STACKSIZE + 16(%rsp), %xmm12
+#endif
+
+#endif
+
+	movq    %rsp, SP      # save old stack
+        subq    $128 + L_BUFFER_SIZE, %rsp
+        andq    $-4096, %rsp    # align stack
+
+        STACK_TOUCH
+
+	cmpq	$0, OLD_M
+	je	.L999
+
+	cmpq	$0, OLD_N
+	je	.L999
+
+	cmpq	$0, OLD_K
+	je	.L999
+
+	movq	OLD_M, M
+	movq	OLD_N, N
+	movq	OLD_K, K
+
+	vmovsd	 %xmm0, ALPHA
+
+	salq	$BASE_SHIFT, LDC
+
+	movq    N, %rax
+        xorq    %rdx, %rdx              //    clear high half of the rdx:rax dividend
+        movq    $2,  %rdi
+        divq    %rdi                    //    N / 2 (divisor is 2 in this kernel)
+        movq    %rax, Ndiv6             //    N / 2, kept in the Ndiv6 slot
+        movq    %rdx, Nmod6             //    N % 2, kept in the Nmod6 slot
+
+	
+
+#ifdef TRMMKERNEL
+	vmovsd	%xmm12, OFFSET
+	vmovsd	%xmm12, KK
+#ifndef LEFT
+	negq	KK
+#endif	
+#endif
+
+	movq	Ndiv6,  J
+	cmpq	$0, J
+	je	.L1_0
+	ALIGN_4
+
+.L2_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	sarq	$2, %rax		// K / 4
+	jz	.L2_01b
+	ALIGN_4
+
+.L2_01a:
+        prefetcht0 512(BO1)
+        prefetchw  512(BO)
+
+	vmovups	      (BO1), %xmm0
+	vmovups	2*SIZE(BO1), %xmm1
+	vmovups	4*SIZE(BO1), %xmm2
+	vmovups	6*SIZE(BO1), %xmm3
+
+	vmovups	%xmm0,       (BO)
+	vmovups	%xmm1, 2*SIZE(BO)
+	vmovups	%xmm2, 4*SIZE(BO)
+	vmovups	%xmm3, 6*SIZE(BO)
+
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO
+	decq	%rax
+	jnz	.L2_01a
+
+
+.L2_01b:
+
+        movq    K, %rax
+        andq    $3, %rax                // K % 4
+        jz      .L2_02d
+        ALIGN_4
+
+.L2_02c:
+
+	vmovups	(BO1), %xmm0
+	vmovups	%xmm0, (BO)
+	addq	$2*SIZE,BO1
+	addq	$2*SIZE,BO
+	decq	%rax
+	jnz	.L2_02c
+
+.L2_02d:
+
+	movq	BO1, B			// next offset of B
+
+.L2_10:
+	movq	C, CO1
+	leaq	(C, LDC, 2), C		// c += 2 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$32 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L2_20
+
+	ALIGN_4
+
+.L2_11:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $16, %rax	// number of values in AO
+#else
+        addq    $2, %rax	// number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L2_16
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_12:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	je	.L2_16
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	je	.L2_16
+
+	jmp	.L2_12
+	ALIGN_4
+
+.L2_16:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = (count loaded above) % 8: steps left after the unrolled loop
+	je .L2_19
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_17:
+
+	KERNEL16x2_SUB
+
+	jl	.L2_17			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L2_19:
+
+	SAVE16x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK				
+#endif
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L2_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L2_20:
+	// Test rest of M
+
+	testq	$15, M			// M % 16 == 0: no partial row tile left
+	jz	.L2_60		// to next 2 lines of N
+
+	testq	$8, M		// bit 3 of M clear: skip ahead to the 4-row test
+	jz	.L2_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L2_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_20_6
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_2:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	je	.L2_20_6
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	je	.L2_20_6
+
+	jmp	.L2_20_2
+	ALIGN_4
+
+.L2_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = (count loaded above) % 8: steps left after the unrolled loop
+	je .L2_20_9
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_20_7:
+
+	KERNEL8x2_SUB
+
+	jl	.L2_20_7		// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L2_20_9:
+
+	SAVE8x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+	testq	$4, M		
+	jz	.L2_30
+	ALIGN_4
+
+.L2_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax			// rax = count - ( count % 8 )
+	je	.L2_26
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_22:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	je	.L2_26
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	je	.L2_26
+
+	jmp	.L2_22
+	ALIGN_4
+
+.L2_26:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = (count loaded above) % 8: steps left after the unrolled loop
+	je .L2_29
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI				// negative index: BO + BI*SIZE is the previous BO
+	negq	%rax				// negative index: AO + rax*SIZE is the previous AO
+	ALIGN_4
+
+.L2_27:
+
+	KERNEL4x2_SUB
+
+	jl	.L2_27			// NOTE(review): branches on flags left by the macro, presumably its index increment - confirm
+	ALIGN_4
+
+
+.L2_29:
+
+	SAVE4x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK
+#endif
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L2_30:
+	testq	$2, M		
+	jz	.L2_40
+
+	ALIGN_4
+
+.L2_31:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // number of values in AO
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_36
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_32:
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	je	.L2_36
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	je	.L2_36
+
+	jmp	.L2_32
+	ALIGN_4
+
+.L2_36:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = rax & 7 ; k-iterations left over after the unrolled-by-8 loop
+	je .L2_39			// no remainder: go straight to SAVE2x2
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+	
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_37:
+
+	KERNEL2x2_SUB
+
+	jl	.L2_37
+	ALIGN_4
+
+
+.L2_39:
+
+	SAVE2x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK
+#endif
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L2_40:
+	testq	$1, M		
+	jz	.L2_60		// to next 2 lines of N
+
+	ALIGN_4
+
+.L2_41:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // number of values in AO
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax
+	je	.L2_46
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_42:
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_3
+	KERNEL1x2_4
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_3
+	KERNEL1x2_4
+
+	je	.L2_46
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_3
+	KERNEL1x2_4
+
+	KERNEL1x2_1
+	KERNEL1x2_2
+	KERNEL1x2_3
+	KERNEL1x2_4
+
+	je	.L2_46
+
+	jmp	.L2_42
+	ALIGN_4
+
+.L2_46:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = rax & 7 ; k-iterations left over after the unrolled-by-8 loop
+	je .L2_49			// no remainder: go straight to SAVE1x2
+
+	movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_47:
+
+	KERNEL1x2_SUB
+
+	jl	.L2_47
+	ALIGN_4
+
+
+.L2_49:
+
+	SAVE1x2
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO         
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+
+
+	
+.L2_60:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $2, KK
+#endif
+
+	decq	J			// j --
+	jg	.L2_01			// next 2 lines of N
+
+
+
+.L1_0:
+
+/************************************************************************************************
+* Loop for Nmod6 % 2 > 0
+*************************************************************************************************/
+
+	movq	Nmod6, J		
+	andq	$1, J			// j % 2
+	je	.L999
+	ALIGN_4
+
+.L1_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq    BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	ALIGN_4
+
+.L1_02b:
+
+	vmovsd	(BO1), %xmm0
+	vmovsd	%xmm0,       (BO)
+	addq	$1*SIZE,BO1
+	addq	$1*SIZE,BO
+	decq	%rax
+	jnz	.L1_02b
+
+.L1_02c:
+
+	movq	BO1, B			// next offset of B
+
+.L1_10:
+	movq	C, CO1
+	leaq	(C, LDC, 1), C		// c += 1 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$32 * SIZE, AO
+
+	movq	M,  I
+	sarq	$4, I			// i = (m >> 4)
+	je	.L1_20
+
+	ALIGN_4
+
+.L1_11:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $16, %rax	// number of values in AO
+#else
+        addq    $1, %rax	// number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L1_16
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_12:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x1_1
+	KERNEL16x1_2
+	KERNEL16x1_3
+	KERNEL16x1_4
+
+	KERNEL16x1_1
+	KERNEL16x1_2
+	KERNEL16x1_3
+	KERNEL16x1_4
+
+	je	.L1_16
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL16x1_1
+	KERNEL16x1_2
+	KERNEL16x1_3
+	KERNEL16x1_4
+
+	KERNEL16x1_1
+	KERNEL16x1_2
+	KERNEL16x1_3
+	KERNEL16x1_4
+
+	je	.L1_16
+
+	jmp	.L1_12
+	ALIGN_4
+
+.L1_16:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = rax & 7 ; k-iterations left over after the unrolled-by-8 loop
+	je .L1_19			// no remainder: go straight to SAVE16x1
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$4, %rax			// rax = rax * 16 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_17:
+
+	KERNEL16x1_SUB
+
+	jl	.L1_17
+	ALIGN_4
+
+
+.L1_19:
+
+	SAVE16x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $4, %rax                        // rax = rax * 16 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK				
+#endif
+
+	addq	$16 * SIZE, CO1		# coffset += 16
+	decq	I			# i --
+	jg	.L1_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L1_20:
+	// Test rest of M
+
+	testq	$15, M
+	jz	.L999
+
+	testq	$8, M		
+	jz	.L1_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L1_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_20_6
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_20_2:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x1_1
+	KERNEL8x1_2
+	KERNEL8x1_3
+	KERNEL8x1_4
+
+	KERNEL8x1_1
+	KERNEL8x1_2
+	KERNEL8x1_3
+	KERNEL8x1_4
+
+	je	.L1_20_6
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL8x1_1
+	KERNEL8x1_2
+	KERNEL8x1_3
+	KERNEL8x1_4
+
+	KERNEL8x1_1
+	KERNEL8x1_2
+	KERNEL8x1_3
+	KERNEL8x1_4
+
+	je	.L1_20_6
+
+	jmp	.L1_20_2
+	ALIGN_4
+
+.L1_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = rax & 7 ; k-iterations left over after the unrolled-by-8 loop
+	je .L1_20_9			// no remainder: go straight to SAVE8x1
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$3, %rax			// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_20_7:
+
+	KERNEL8x1_SUB
+
+	jl	.L1_20_7
+	ALIGN_4
+
+
+.L1_20_9:
+
+	SAVE8x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $3, %rax                        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+	
+
+
+/**************************************************************************/
+
+.L1_21pre:
+
+	testq	$4, M		
+	jz	.L1_30
+	ALIGN_4
+
+.L1_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_26
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_22:
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x1_1
+	KERNEL4x1_2
+	KERNEL4x1_3
+	KERNEL4x1_4
+
+	KERNEL4x1_1
+	KERNEL4x1_2
+	KERNEL4x1_3
+	KERNEL4x1_4
+
+	je	.L1_26
+
+	prefetcht0      B_PR1(BO,BI,8)
+	KERNEL4x1_1
+	KERNEL4x1_2
+	KERNEL4x1_3
+	KERNEL4x1_4
+
+	KERNEL4x1_1
+	KERNEL4x1_2
+	KERNEL4x1_3
+	KERNEL4x1_4
+
+	je	.L1_26
+
+	jmp	.L1_22
+	ALIGN_4
+
+.L1_26:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = rax & 7 ; k-iterations left over after the unrolled-by-8 loop
+	je .L1_29			// no remainder: go straight to SAVE4x1
+
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$2, %rax			// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_27:
+
+	KERNEL4x1_SUB
+
+	jl	.L1_27
+	ALIGN_4
+
+
+.L1_29:
+
+	SAVE4x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $2, %rax                        // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK
+#endif
+
+	addq	$4 * SIZE, CO1		# coffset += 4
+	ALIGN_4
+	
+
+.L1_30:
+	testq	$2, M		
+	jz	.L1_40
+
+	ALIGN_4
+
+.L1_31:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // number of values in AO
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L1_36
+	movq    %rax, BI                        //  Index for BO
+
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_32:
+
+	KERNEL2x1_1
+	KERNEL2x1_2
+	KERNEL2x1_3
+	KERNEL2x1_4
+
+	KERNEL2x1_1
+	KERNEL2x1_2
+	KERNEL2x1_3
+	KERNEL2x1_4
+
+	je	.L1_36
+
+	KERNEL2x1_1
+	KERNEL2x1_2
+	KERNEL2x1_3
+	KERNEL2x1_4
+
+	KERNEL2x1_1
+	KERNEL2x1_2
+	KERNEL2x1_3
+	KERNEL2x1_4
+
+	je	.L1_36
+
+	jmp	.L1_32
+	ALIGN_4
+
+.L1_36:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = rax & 7 ; k-iterations left over after the unrolled-by-8 loop
+	je .L1_39			// no remainder: go straight to SAVE2x1
+
+	movq    %rax, BI                        //  Index for BO
+	
+	salq	$1, %rax			// rax = rax *2 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_37:
+
+	KERNEL2x1_SUB
+
+	jl	.L1_37
+	ALIGN_4
+
+
+.L1_39:
+
+	SAVE2x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        salq    $1, %rax                        // rax = rax * 2 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK
+#endif
+
+	addq	$2 * SIZE, CO1		# coffset += 2
+	ALIGN_4
+
+.L1_40:
+	testq	$1, M		
+	jz	.L999
+
+	ALIGN_4
+
+.L1_41:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $2 * SIZE, BO
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // number of values in AO
+#else
+        addq    $1, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+	andq	$-8, %rax
+	je	.L1_46
+	movq    %rax, BI                        //  Index for BO
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_42:
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	je	.L1_46
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	je	.L1_46
+
+	jmp	.L1_42
+	ALIGN_4
+
+.L1_46:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+	andq	$7, %rax		# rax = rax & 7 ; k-iterations left over after the unrolled-by-8 loop
+	je .L1_49			// no remainder: go straight to SAVE1x1
+
+	movq    %rax, BI                        //  Index for BO
+
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_47:
+
+	KERNEL1x1_SUB
+
+	jl	.L1_47
+	ALIGN_4
+
+
+.L1_49:
+
+	SAVE1x1
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax 
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO         
+        leaq    (AO, %rax, SIZE), AO
+#endif  
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+	addq	$1 * SIZE, CO1		# coffset += 1
+	ALIGN_4
+	
+
+.L999:
+	movq   		SP, %rsp
+	movq	   (%rsp), %rbx
+	movq	  8(%rsp), %rbp
+	movq	 16(%rsp), %r12
+	movq	 24(%rsp), %r13
+	movq	 32(%rsp), %r14
+	movq	 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	movq	 48(%rsp), %rdi
+	movq	 56(%rsp), %rsi
+	movups	 64(%rsp), %xmm6
+	movups	 80(%rsp), %xmm7
+	movups	 96(%rsp), %xmm8
+	movups	112(%rsp), %xmm9
+	movups	128(%rsp), %xmm10
+	movups	144(%rsp), %xmm11
+	movups	160(%rsp), %xmm12
+	movups	176(%rsp), %xmm13
+	movups	192(%rsp), %xmm14
+	movups	208(%rsp), %xmm15
+#endif
+
+	addq	$STACKSIZE, %rsp
+	ret
+
+	EPILOGUE
+
+
+
+
+
+#endif
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S
index 1fab892ca..ac4421252 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S
@@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	vmovups 	-16 * SIZE(AO), %zmm0
 	vbroadcastss	 -4 * SIZE(BO), %zmm2
 	vbroadcastss	 -3 * SIZE(BO), %zmm3
-	prefetcht0	A_PR1(AO)
+#	prefetcht0	A_PR1(AO)
 
 	VFMADD231PS_(  	%zmm4,%zmm2,%zmm0 )
 	VFMADD231PS_(  	%zmm6,%zmm3,%zmm0 )
@@ -183,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	vmovups 	-16 * SIZE(AO), %zmm0
 	vbroadcastss	 -4 * SIZE(BO), %zmm2
 	vbroadcastss	 -3 * SIZE(BO), %zmm3
-	prefetcht0	A_PR1(AO)
 
 	VFMADD231PS_(  	%zmm4,%zmm2,%zmm0 )
 	VFMADD231PS_(  	%zmm6,%zmm3,%zmm0 )

From ac7b6e3e9aeffe111a0ef23ba74ac2b181b87e30 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 4 Jun 2018 08:23:40 +0200
Subject: [PATCH 24/86] Fix misplaced endif

---
 driver/others/dynamic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 2c902d108..ac1186c8f 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -302,8 +302,8 @@ static gotoblas_t *get_coretype(void){
 	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
 	    return &gotoblas_NEHALEM;
 	  }
+#endif		
 	}
-#endif
 	//Intel Skylake
 	if (model == 14) {
 	  if(support_avx())

From 8be027e4c62460f373980e883c487a30a15b5a5d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 4 Jun 2018 14:36:39 +0200
Subject: [PATCH 25/86] Update dynamic.c

---
 driver/others/dynamic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index ac1186c8f..96612cc52 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -82,7 +82,7 @@ extern gotoblas_t  gotoblas_ZEN;
 #ifndef NO_AVX512
 extern gotoblas_t  gotoblas_SKYLAKEX;
 #else
-#define gotoblas_SKYLAKEX gotoblas_HASWELL;
+#define gotoblas_SKYLAKEX gotoblas_HASWELL
 #endif
 #endif
 #else

From dc9fe05ab5845452d684746bb7b7b7ad400c0c31 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 4 Jun 2018 17:10:19 +0200
Subject: [PATCH 26/86] Update cpuid_x86.c

---
 cpuid_x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index d0dbe1d24..fc937865c 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1883,7 +1883,7 @@ int get_coretype(void){
 #ifndef NO_AVX512
 	    return CORE_SKYLAKEX;
 #else
-	  if/support_avx())
+	  if(support_avx())
 #ifndef NO_AVX2
 	    return CORE_HASWELL;
 #else

From b7feded85acaf95d68ed4cfd573e60c83fdbca5d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Jun 2018 10:24:05 +0200
Subject: [PATCH 27/86] Propagate NO_AVX512 via CCOMMON_OPT

---
 Makefile.system | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 82e38a6d2..8c875d6f7 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -939,6 +939,10 @@ ifeq ($(NO_AVX2), 1)
 CCOMMON_OPT	+= -DNO_AVX2
 endif
 
+ifeq ($(NO_AVX512), 1)
+CCOMMON_OPT	+= -DNO_AVX512
+endif
+
 ifdef SMP
 CCOMMON_OPT	+= -DSMP_SERVER
 

From 38ad05bd0484ea723a42415f986cf0db24e01ca8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Jun 2018 10:26:49 +0200
Subject: [PATCH 28/86] Extend loop range to find SkylakeX in force_coretype

---
 driver/others/dynamic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 96612cc52..acb2d8b8c 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -506,7 +506,7 @@ static gotoblas_t *force_coretype(char *coretype){
 	char message[128];
 	//char mname[20];
 
-	for ( i=1 ; i <= 23; i++)
+	for ( i=1 ; i <= 24; i++)
 	{
 		if (!strncasecmp(coretype,corename[i],20))
 		{

From 354a976a59f1280c5403b8de37587baf53527b67 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Jun 2018 10:31:34 +0200
Subject: [PATCH 29/86] Fix inverted condition in _Atomic declaration

fixes #1593
---
 common.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/common.h b/common.h
index 123e3dee7..980099ee3 100644
--- a/common.h
+++ b/common.h
@@ -642,6 +642,7 @@ void gotoblas_profile_init(void);
 void gotoblas_profile_quit(void);
 
 #ifdef USE_OPENMP
+
 #ifndef C_MSVC
 int omp_in_parallel(void);
 int omp_get_num_procs(void);
@@ -649,12 +650,15 @@ int omp_get_num_procs(void);
 __declspec(dllimport) int __cdecl omp_in_parallel(void);
 __declspec(dllimport) int __cdecl omp_get_num_procs(void);
 #endif
+
 #if (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#else
 #ifndef _Atomic
 #define _Atomic volatile
 #endif
-#include <stdatomic.h>
 #endif
+
 #else
 #ifdef __ELF__
 int omp_in_parallel  (void) __attribute__ ((weak));

From 15a78d6b662569a464de9a00517897b036fe7886 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Jun 2018 15:58:34 +0200
Subject: [PATCH 30/86] export NO_AVX512 setting

---
 Makefile.system | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile.system b/Makefile.system
index 8c875d6f7..eaf3e9889 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1249,6 +1249,7 @@ export MSA_FLAGS
 export KERNELDIR
 export FUNCTION_PROFILE
 export TARGET_CORE
+export NO_AVX512
 
 export SGEMM_UNROLL_M
 export SGEMM_UNROLL_N

From e8002536ec90b74148abce1c3de9bca0061dbe32 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Jun 2018 18:23:01 +0200
Subject: [PATCH 31/86] disable quiet_make for the moment

---
 Makefile.system | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index eaf3e9889..5c16e2bee 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1)
 GETARCH_FLAGS	+= -g
 endif
 
-ifeq ($(QUIET_MAKE), 1)
-MAKE += -s
-endif
+#ifeq ($(QUIET_MAKE), 1)
+#MAKE += -s
+#endif
 
 ifndef NO_PARALLEL_MAKE
 NO_PARALLEL_MAKE=0

From f6021c798dea23685af3eedcb63c4a388c78f226 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Jun 2018 19:09:38 +0200
Subject: [PATCH 32/86] Re-enable QUIET_MAKE

---
 Makefile.system | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 5c16e2bee..eaf3e9889 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1)
 GETARCH_FLAGS	+= -g
 endif
 
-#ifeq ($(QUIET_MAKE), 1)
-#MAKE += -s
-#endif
+ifeq ($(QUIET_MAKE), 1)
+MAKE += -s
+endif
 
 ifndef NO_PARALLEL_MAKE
 NO_PARALLEL_MAKE=0

From 7fb62aed7e2a08fb8fc62054a164d3479511ce82 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Jun 2018 23:29:33 +0200
Subject: [PATCH 33/86] Check build system support for AVX512 instructions

---
 cmake/system_check.cmake | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index d47c38cdd..f054852bf 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -66,3 +66,12 @@ else()
   set(BINARY32 1)
 endif()
 
+if (X86_64 OR X86)
+  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "void main(void){ __asm__ volatile(\"vaddps %zmm1, %zmm0, %zmm0\"); }")
+execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
+if (NO_AVX512 EQUAL 1)
+set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
+endif()
+  file(REMOVE "${PROJECT_BINARY_DIR}/avx512.tmp" "${PROJECT_BINARY_DIR}/avx512.o")  # probe files were written to the binary dir; bare relative names resolve against the source dir and would remove nothing
+endif()
+

From 06d43760e4ca2cc7007e54d88938eff9e95e0579 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 6 Jun 2018 09:18:10 +0200
Subject: [PATCH 34/86] Restore _Atomic define before stdatomic.h for old gcc

see #1593
---
 common.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/common.h b/common.h
index 123e3dee7..ecf07316d 100644
--- a/common.h
+++ b/common.h
@@ -649,12 +649,21 @@ int omp_get_num_procs(void);
 __declspec(dllimport) int __cdecl omp_in_parallel(void);
 __declspec(dllimport) int __cdecl omp_get_num_procs(void);
 #endif
+
 #if (__STDC_VERSION__ >= 201112L)
+#if defined(C_GCC) && ( __GNUC__ < 7) 
+// workaround for GCC bug 65467
 #ifndef _Atomic
 #define _Atomic volatile
 #endif
-#include <stdatomic.h>
 #endif
+#include <stdatomic.h>
+#else
+#ifndef _Atomic
+#define _Atomic volatile
+#endif
+
+
 #else
 #ifdef __ELF__
 int omp_in_parallel  (void) __attribute__ ((weak));

From 83da278093e32f1e089a12d880c7ec65dfbb1457 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 6 Jun 2018 09:27:49 +0200
Subject: [PATCH 35/86] Update common.h

---
 common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common.h b/common.h
index cd1c4c0d1..663f37e7b 100644
--- a/common.h
+++ b/common.h
@@ -663,6 +663,7 @@ __declspec(dllimport) int __cdecl omp_get_num_procs(void);
 #ifndef _Atomic
 #define _Atomic volatile
 #endif
+#endif
 
 #else
 #ifdef __ELF__

From 9b87b642624b398ebacee525edbc879cf3f950ea Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 6 Jun 2018 16:49:00 +0200
Subject: [PATCH 36/86] Improve AVX512 testcase

clang 3.4 managed to accept the original test code, only to fail on the actual Skylake asm later
---
 c_check | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/c_check b/c_check
index dfe99350a..cc64c16c6 100644
--- a/c_check
+++ b/c_check
@@ -203,8 +203,8 @@ $binformat    = bin64  if ($data =~ /BINARY_64/);
 
 $no_avx512= 0;
 if (($architecture eq "x86") || ($architecture eq "x86_64")) {
-    $code = '"vaddps %zmm1, %zmm0, %zmm0"'; 
-    print $tmpf "void main(void){ __asm__ volatile($code); }\n";
+    $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
+    print $tmpf "int main(void){ __asm__ volatile($code); }\n";
     $args = " -o $tmpf.o -x c $tmpf";
     my @cmd = ("$compiler_name $args");
     system(@cmd) == 0;

From e4718b1fee0f8dcd0c892063d619477bd5ed31ce Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 6 Jun 2018 16:51:30 +0200
Subject: [PATCH 37/86] Better AVX512 test case

---
 cmake/system_check.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index f054852bf..a565fc0d5 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -67,7 +67,7 @@ else()
 endif()
 
 if (X86_64 OR X86)
-  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "void main(void){ __asm__ volatile(\"vaddps %zmm1, %zmm0, %zmm0\"); }")
+  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
 execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
 if (NO_AVX512 EQUAL 1)
 set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")

From ed7c4a043b3093dfe8ddb3d6d3e3d6fd6af43d4a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 7 Jun 2018 10:18:26 +0200
Subject: [PATCH 38/86] Use usleep instead of sched_yield by default

sched_yield only burns cpu cycles, fixes #900,  see also #923, #1560
---
 common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common.h b/common.h
index 663f37e7b..b7181e670 100644
--- a/common.h
+++ b/common.h
@@ -356,7 +356,7 @@ typedef int blasint;
 */
 
 #ifndef YIELDING
-#define YIELDING	sched_yield()
+#define YIELDING	usleep(10)
 #endif
 
 /***

From e8880c1699816483090aa5574cf9b3322943831f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 7 Jun 2018 10:26:55 +0200
Subject: [PATCH 39/86] Use a single thread for small input size

copies daxpy improvement from #27, see #1560
---
 interface/zaxpy.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/interface/zaxpy.c b/interface/zaxpy.c
index fbb830ffb..529e78e79 100644
--- a/interface/zaxpy.c
+++ b/interface/zaxpy.c
@@ -41,7 +41,11 @@
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
 #endif
-
+#if  defined(Z13)
+#define MULTI_THREAD_MINIMAL  200000
+#else
+#define MULTI_THREAD_MINIMAL  10000
+#endif
 #ifndef CBLAS
 
 void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
 #endif
 
 #ifndef CBLAS
-  PRINT_DEBUG_CNAME;
+  PRINT_DEBUG_NAME;
 #else
   PRINT_DEBUG_CNAME;
 #endif
@@ -93,6 +97,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
   if (incx == 0 || incy == 0)
 	  nthreads = 1;
 
+  //Work around the low performance issue with small imput size &
+  //multithreads.
+  if (n <= MULTI_THREAD_MINIMAL) {
+	  nthreads = 1;
+  }
   if (nthreads == 1) {
 #endif
 

From 66316b9f4c8c7c48eed8b29e86f64581c02d45b0 Mon Sep 17 00:00:00 2001
From: Craig Donner <cdonner@google.com>
Date: Thu, 7 Jun 2018 14:54:42 +0100
Subject: [PATCH 40/86] Improve performance of GEMM for small matrices when SMP
 is defined.

Always checking num_cpu_avail() regardless of whether threading will actually
be used adds noticeable overhead for small matrices.  Most other uses of
num_cpu_avail() do so only if threading will be used, so do the same here.
---
 interface/gemm.c | 27 ++++++---------------------
 interface/trsm.c |  3 ++-
 2 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/interface/gemm.c b/interface/gemm.c
index 8baf3fbec..a3bac5984 100644
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -44,6 +44,7 @@
 #endif
 
 #ifndef COMPLEX
+#define SMP_THRESHOLD_MIN 65536.0
 #ifdef XDOUBLE
 #define ERROR_NAME "QGEMM "
 #elif defined(DOUBLE)
@@ -52,6 +53,7 @@
 #define ERROR_NAME "SGEMM "
 #endif
 #else
+#define SMP_THRESHOLD_MIN 8192.0
 #ifndef GEMM3M
 #ifdef XDOUBLE
 #define ERROR_NAME "XGEMM "
@@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
   FLOAT *sa, *sb;
 
 #ifdef SMP
-  int nthreads_max;
-  int nthreads_avail;
   double MNK;
 #ifndef COMPLEX
 #ifdef XDOUBLE
@@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
   XFLOAT *sa, *sb;
 
 #ifdef SMP
-  int nthreads_max;
-  int nthreads_avail;
   double MNK;
 #ifndef COMPLEX
 #ifdef XDOUBLE
@@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
   mode |= (transa << BLAS_TRANSA_SHIFT);
   mode |= (transb << BLAS_TRANSB_SHIFT);
 
-  nthreads_max = num_cpu_avail(3);
-  nthreads_avail = nthreads_max;
-
-#ifndef COMPLEX
   MNK = (double) args.m * (double) args.n * (double) args.k;
-  if ( MNK <= (65536.0  * (double) GEMM_MULTITHREAD_THRESHOLD)  )
-	nthreads_max = 1;
-#else
-  MNK = (double) args.m * (double) args.n * (double) args.k;
-  if ( MNK <= (8192.0  * (double) GEMM_MULTITHREAD_THRESHOLD)  )
-	nthreads_max = 1;
-#endif
-  args.common = NULL;
-
-  if ( nthreads_max > nthreads_avail )
-  	args.nthreads = nthreads_avail;
+  if ( MNK <= (SMP_THRESHOLD_MIN  * (double) GEMM_MULTITHREAD_THRESHOLD)  )
+	args.nthreads = 1;
   else
-  	args.nthreads = nthreads_max;
-
+	args.nthreads = num_cpu_avail(3);
+  args.common = NULL;
 
  if (args.nthreads == 1) {
 #endif
diff --git a/interface/trsm.c b/interface/trsm.c
index 60c49795d..5c2750e79 100644
--- a/interface/trsm.c
+++ b/interface/trsm.c
@@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
   mode |= (trans << BLAS_TRANSA_SHIFT);
   mode |= (side  << BLAS_RSIDE_SHIFT);
 
-  args.nthreads = num_cpu_avail(3);
   if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
 	args.nthreads = 1;
   else
 	if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
 		args.nthreads = 1;
+  else
+	args.nthreads = num_cpu_avail(3);
 		
 
   if (args.nthreads == 1) {

From 6c2d90ba7724b05e7fb97c7ec33324499e4a1a79 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 9 Jun 2018 16:29:17 +0200
Subject: [PATCH 41/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER
 option

---
 CMakeLists.txt   |  1 +
 Makefile         |  3 +++
 Makefile.install |  2 +-
 Makefile.rule    |  5 +++++
 Makefile.system  | 17 ++++++++++++++++-
 5 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f49f20513..66c3d8afa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON
 endif()
 option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
 option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
+option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF)
 option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
 #######
 if(BUILD_WITHOUT_LAPACK)
diff --git a/Makefile b/Makefile
index 380ba1ce8..56b4426f8 100644
--- a/Makefile
+++ b/Makefile
@@ -153,6 +153,9 @@ ifeq ($(DYNAMIC_ARCH), 1)
 	do  $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
 	done
 	@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
+ifeq ($(DYNAMIC_OLDER), 1)
+	@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
+endif	
 endif
 ifdef USE_THREAD
 	@echo USE_THREAD=$(USE_THREAD) >>  Makefile.conf_last
diff --git a/Makefile.install b/Makefile.install
index 21c3c9e22..c51c8a021 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -98,7 +98,7 @@ endif
 	@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
 	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
 	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
+	@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
 	@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
 	@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
 	@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
diff --git a/Makefile.rule b/Makefile.rule
index 1b4b8eb63..5c03d0195 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -17,6 +17,11 @@ VERSION = 0.3.1.dev
 # If you want to support multiple architecture in one binary
 # DYNAMIC_ARCH = 1
 
+# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
+# mode (including individual optimized codes for PENRYN, DUNNINGTON, OPTERON,
+# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
+# DYNAMIC_OLDER = 1
+
 # C compiler including binary type(32bit / 64bit). Default is gcc.
 # Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
 # CC = gcc
diff --git a/Makefile.system b/Makefile.system
index eaf3e9889..62ba0e466 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -472,7 +472,18 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
 endif
 
 ifeq ($(ARCH), x86_64)
-DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
+DYNAMIC_CORE = PRESCOTT CORE2 
+ifeq ($(DYNAMIC_OLDER), 1)
+DYNAMIC_CORE += PENRYN DUNNINGTON 
+endif
+DYNAMIC_CORE += NEHALEM 
+ifeq ($(DYNAMIC_OLDER), 1)
+DYNAMIC_CORE += OPTERON OPTERON_SSE3 
+endif
+DYNAMIC_CORE += BARCELONA 
+ifeq ($(DYNAMIC_OLDER), 1)
+DYNAMIC_CORE += BOBCAT ATOM NANO
+endif
 ifneq ($(NO_AVX), 1)
 DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
 endif
@@ -917,6 +928,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
 CCOMMON_OPT	+= -DDYNAMIC_ARCH
 endif
 
+ifeq ($(DYNAMIC_OLDER), 1)
+CCOMMON_OPT	+= -DDYNAMIC_OLDER
+endif
+
 ifeq ($(NO_LAPACK), 1)
 CCOMMON_OPT	+= -DNO_LAPACK
 #Disable LAPACK C interface

From 1cbd8f3ae47ffb89523fa247e81ffea07c6505a4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 9 Jun 2018 16:30:46 +0200
Subject: [PATCH 42/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER
 option

---
 cmake/arch.cmake     | 13 ++++++++++++-
 cmake/openblas.pc.in |  2 +-
 cmake/system.cmake   |  3 +++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 527d2bec6..52fb64eaa 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -49,7 +49,18 @@ if (DYNAMIC_ARCH)
   endif ()
 
   if (X86_64)
-    set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
+    set(DYNAMIC_CORE PRESCOTT CORE2)
+    if (DYNAMIC_OLDER)
+	set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON)
+    endif ()
+    set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM)
+    if (DYNAMIC_OLDER)
+	set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3)
+    endif ()
+    set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA) 
+    if (DYNAMIC_OLDER)
+	set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO)
+    endif ()
     if (NOT NO_AVX)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
     endif ()
diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in
index 35973b09b..ca88a6d5f 100644
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@@ -1,7 +1,7 @@
 libdir=@CMAKE_INSTALL_FULL_LIBDIR@
 includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 
-openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ 
+openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ 
 Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: @OPENBLAS_VERSION@
diff --git a/cmake/system.cmake b/cmake/system.cmake
index c21fe7c14..48e8f75bc 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -163,6 +163,9 @@ endif ()
 
 if (DYNAMIC_ARCH)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
+  if (DYNAMIC_OLDER)
+    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+  endif ()
 endif ()
 
 if (NO_LAPACK)

From 63f7395fb49091295463785f6c1056f61dd64a7d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 9 Jun 2018 16:31:38 +0200
Subject: [PATCH 43/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER
 option

---
 driver/others/dynamic.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index acb2d8b8c..4271c0a0d 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -56,16 +56,27 @@ EXTERN gotoblas_t  gotoblas_BANIAS;
 EXTERN gotoblas_t  gotoblas_ATHLON;
 
 extern gotoblas_t  gotoblas_PRESCOTT;
+extern gotoblas_t  gotoblas_CORE2;
+extern gotoblas_t  gotoblas_NEHALEM;
+extern gotoblas_t  gotoblas_BARCELONA;
+#ifdef DYNAMIC_OLDER
 extern gotoblas_t  gotoblas_ATOM;
 extern gotoblas_t  gotoblas_NANO;
-extern gotoblas_t  gotoblas_CORE2;
 extern gotoblas_t  gotoblas_PENRYN;
 extern gotoblas_t  gotoblas_DUNNINGTON;
-extern gotoblas_t  gotoblas_NEHALEM;
 extern gotoblas_t  gotoblas_OPTERON;
 extern gotoblas_t  gotoblas_OPTERON_SSE3;
-extern gotoblas_t  gotoblas_BARCELONA;
 extern gotoblas_t  gotoblas_BOBCAT;
+#else
+#define gotoblas_ATOM gotoblas_NEHALEM
+#define gotoblas_NANO gotoblas_NEHALEM
+#define gotoblas_PENRYN gotoblas_CORE2
+#define gotoblas_DUNNINGTON gotoblas_CORE2
+#define gotoblas_OPTERON gotoblas_CORE2
+#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
+#define gotoblas_BOBCAT gotoblas_CORE2
+#endif
+
 #ifndef NO_AVX
 extern gotoblas_t  gotoblas_SANDYBRIDGE;
 extern gotoblas_t  gotoblas_BULLDOZER;

From e9cd11768c20707eff31912db1bafc837c0224d2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 9 Jun 2018 17:54:36 +0200
Subject: [PATCH 44/86] Enable parallel make on MS Windows by default

fixes #874
---
 getarch.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/getarch.c b/getarch.c
index fcffe63e2..31f41d62c 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1196,9 +1196,7 @@ int main(int argc, char *argv[]){
 #elif NO_PARALLEL_MAKE==1
     printf("MAKE += -j 1\n");
 #else
-#ifndef OS_WINDOWS
     printf("MAKE += -j %d\n", get_num_cores());
-#endif
 #endif
 
     break;

From 0bea6bb9e7e2468bc9d42f5ffdf27f772f2984af Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 10 Jun 2018 09:24:37 +0200
Subject: [PATCH 45/86] Create OpenBLASConfig.cmake from cmake as well

---
 CMakeLists.txt | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f49f20513..e1c308910 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON
 endif()
 option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
 option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
+option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF)
 option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
 #######
 if(BUILD_WITHOUT_LAPACK)
@@ -208,6 +209,7 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
 
 # Install libraries
 install(TARGETS ${OpenBLAS_LIBNAME}
+	EXPORT "OpenBLASTargets"
 	RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
 	ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
@@ -267,3 +269,21 @@ if(PKG_CONFIG_FOUND)
 	configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
 	install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
 endif()
+
+
+# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
+set(PN OpenBLAS)
+set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}")
+configure_package_config_file(cmake/${PN}Config.cmake.in
+                              "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake"
+                              INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
+write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
+                                 VERSION ${${PN}_VERSION}
+                                 COMPATIBILITY AnyNewerVersion)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
+        DESTINATION ${CMAKECONFIG_INSTALL_DIR})
+install(EXPORT "${PN}Targets"
+        NAMESPACE "${PN}::"
+        DESTINATION ${CMAKECONFIG_INSTALL_DIR})
+

From 02634b549b678dc38c85ce4c77ebb532e8d9e471 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 10 Jun 2018 09:25:46 +0200
Subject: [PATCH 46/86] Add template for OpenBLASConfig.cmake

---
 cmake/OpenBLASConfig.cmake.in | 79 +++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 cmake/OpenBLASConfig.cmake.in

diff --git a/cmake/OpenBLASConfig.cmake.in b/cmake/OpenBLASConfig.cmake.in
new file mode 100644
index 000000000..87a1621b4
--- /dev/null
+++ b/cmake/OpenBLASConfig.cmake.in
@@ -0,0 +1,79 @@
+# OpenBLASConfig.cmake
+# --------------------
+#
+# OpenBLAS cmake module.
+# This module sets the following variables in your project::
+#
+#   OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system
+#   OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release
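+#   OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located (INTERFACE_INCLUDE_DIRECTORIES of OpenBLAS::OpenBLAS).
+#   OpenBLAS_INCLUDE_DIR - INCLUDE_DIRECTORIES property of OpenBLAS::OpenBLAS (may differ from DIRS)
+#   OpenBLAS_LIBRARIES - INTERFACE_LINK_LIBRARIES of OpenBLAS::OpenBLAS (libraries its consumers link against).
+#   OpenBLAS_LIBRARY - LOCATION property of OpenBLAS::OpenBLAS (the OpenBLAS library file itself)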
+#
+#
+# Available components::
+#
+##   shared - search for only shared library
+##   static - search for only static library
+#   serial - search for unthreaded library
+#   pthread - search for native pthread threaded library
+#   openmp - search for OpenMP threaded library
+#
+#
+# Exported targets::
+#
+# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED`
+## target. Target is shared _or_ static, so, for both, use separate, not
+## overlapping, installations. ::
+#
+#   OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached.
+#
+#
+# Suggested usage::
+#
+#   find_package(OpenBLAS)
+#   find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread)
+#
+#
+# The following variables can be set to guide the search for this package::
+#
+#   OpenBLAS_DIR - CMake variable, set to directory containing this Config file
+#   CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
+#   PATH - environment variable, set to bin directory of this package
+#   CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables
+#     find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build
+
+@PACKAGE_INIT@
+
+set(PN OpenBLAS)
+
+# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon
+if(@USE_OPENMP@)
+    set(${PN}_openmp_FOUND 1)
+elseif(@USE_THREAD@)
+    set(${PN}_pthread_FOUND 1)
+else()
+    set(${PN}_serial_FOUND 1)
+endif()
+
+check_required_components(${PN})
+
+#-----------------------------------------------------------------------------
+# Don't include targets if this file is being picked up by another
+# project which has already built this as a subproject
+#-----------------------------------------------------------------------------
+if(NOT TARGET ${PN}::OpenBLAS)
+    include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
+
+    get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION)
+    set(${PN}_LIBRARY ${_loc})
+    get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES)
+    set(${PN}_LIBRARIES ${_ill})
+
+    get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES)
+    set(${PN}_INCLUDE_DIR ${_id})
+    get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
+    set(${PN}_INCLUDE_DIRS ${_iid})
+endif()
+

From e65f451409e2150bf299a2cdd906bec4ffff7915 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 10 Jun 2018 15:09:43 +0200
Subject: [PATCH 47/86] include CMakePackageConfigHelpers

---
 CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1c308910..a2421ac54 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,9 @@ set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${Open
 # Adhere to GNU filesystem layout conventions
 include(GNUInstallDirs)
 
+include(CMakePackageConfigHelpers)
+
+
 set(OpenBLAS_LIBNAME openblas)
 
 #######

From c2545b0fd6978e1fb09c2dc86b825846e0034228 Mon Sep 17 00:00:00 2001
From: Craig Donner <cdonner@google.com>
Date: Mon, 11 Jun 2018 10:13:09 +0100
Subject: [PATCH 48/86] Fixed a few more unnecessary calls to num_cpu_avail.

I don't have as many benchmarks for these as for gemm, but it should still
make a difference for small matrices.
---
 interface/axpy.c                        | 14 ++++++--------
 interface/scal.c                        |  5 +++--
 interface/zaxpy.c                       | 14 ++++++--------
 interface/zscal.c                       |  4 ++--
 interface/zswap.c                       |  4 ++--
 kernel/arm64/casum_thunderx2t99.c       |  9 +++------
 kernel/arm64/copy_thunderx2t99.c        |  9 +++------
 kernel/arm64/dasum_thunderx2t99.c       |  9 +++------
 kernel/arm64/dot_thunderx2t99.c         | 11 ++++-------
 kernel/arm64/dznrm2_thunderx2t99.c      |  4 ++--
 kernel/arm64/dznrm2_thunderx2t99_fast.c |  4 ++--
 kernel/arm64/iamax_thunderx2t99.c       |  9 +++------
 kernel/arm64/izamax_thunderx2t99.c      |  9 +++------
 kernel/arm64/sasum_thunderx2t99.c       |  9 +++------
 kernel/arm64/scnrm2_thunderx2t99.c      |  4 ++--
 kernel/arm64/zasum_thunderx2t99.c       |  9 +++------
 kernel/arm64/zdot_thunderx2t99.c        |  9 +++------
 kernel/x86_64/ddot.c                    | 15 ++++++---------
 18 files changed, 59 insertions(+), 92 deletions(-)

diff --git a/interface/axpy.c b/interface/axpy.c
index f0d95b395..39edea6af 100644
--- a/interface/axpy.c
+++ b/interface/axpy.c
@@ -40,11 +40,11 @@
 #include "common.h"
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
-#endif 
+#endif
 #if  defined(Z13)
 #define MULTI_THREAD_MINIMAL  200000
 #else
-#define MULTI_THREAD_MINIMAL  10000        
+#define MULTI_THREAD_MINIMAL  10000
 #endif
 #ifndef CBLAS
 
@@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
   if (incy < 0) y -= (n - 1) * incy;
 
 #ifdef SMP
-  nthreads = num_cpu_avail(1);
-
   //disable multi-thread when incx==0 or incy==0
   //In that case, the threads would be dependent.
-  if (incx == 0 || incy == 0)
-	  nthreads = 1;
-
+  //
   //Temporarily work-around the low performance issue with small imput size &
   //multithreads.
-  if (n <= MULTI_THREAD_MINIMAL)
+  if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
 	  nthreads = 1;
+  else
+	  nthreads = num_cpu_avail(1);
 
   if (nthreads == 1) {
 #endif
diff --git a/interface/scal.c b/interface/scal.c
index 3f468a2a3..6d07b1650 100644
--- a/interface/scal.c
+++ b/interface/scal.c
@@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
 
 
 #ifdef SMP
-  nthreads = num_cpu_avail(1);
-
   if (n <= 1048576 )
 	nthreads = 1;
+  else
+	nthreads = num_cpu_avail(1);
+
 
   if (nthreads == 1) {
 #endif
diff --git a/interface/zaxpy.c b/interface/zaxpy.c
index 529e78e79..1a0259c96 100644
--- a/interface/zaxpy.c
+++ b/interface/zaxpy.c
@@ -90,18 +90,16 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
   if (incy < 0) y -= (n - 1) * incy * 2;
 
 #ifdef SMP
-  nthreads = num_cpu_avail(1);
-
   //disable multi-thread when incx==0 or incy==0
   //In that case, the threads would be dependent.
-  if (incx == 0 || incy == 0)
-	  nthreads = 1;
-
-  //Work around the low performance issue with small imput size &
+  //
+  //Temporarily work-around the low performance issue with small imput size &
   //multithreads.
-  if (n <= MULTI_THREAD_MINIMAL) {
+  if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
 	  nthreads = 1;
-  }
+  else
+	  nthreads = num_cpu_avail(1);
+
   if (nthreads == 1) {
 #endif
 
diff --git a/interface/zscal.c b/interface/zscal.c
index 633b6ecf5..bfaddc260 100644
--- a/interface/zscal.c
+++ b/interface/zscal.c
@@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
   FUNCTION_PROFILE_START();
 
 #ifdef SMP
-  nthreads = num_cpu_avail(1);
-
   if ( n <= 1048576 )
 	nthreads = 1;
+  else
+	nthreads = num_cpu_avail(1);
 
   if (nthreads == 1) {
 #endif
diff --git a/interface/zswap.c b/interface/zswap.c
index 5308cbe90..e33bbafba 100644
--- a/interface/zswap.c
+++ b/interface/zswap.c
@@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy;
   if (incy < 0) y -= (n - 1) * incy * 2;
 
 #ifdef SMP
-  nthreads = num_cpu_avail(1);
-
   //disable multi-thread when incx==0 or incy==0
   //In that case, the threads would be dependent.
   if (incx == 0 || incy == 0)
 	  nthreads = 1;
+  else
+	  nthreads = num_cpu_avail(1);
 
   if (nthreads == 1) {
 #endif
diff --git a/kernel/arm64/casum_thunderx2t99.c b/kernel/arm64/casum_thunderx2t99.c
index cd5d936c5..c6dbb3f77 100644
--- a/kernel/arm64/casum_thunderx2t99.c
+++ b/kernel/arm64/casum_thunderx2t99.c
@@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT asum = 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		asum = casum_compute(n, x, inc_x);
diff --git a/kernel/arm64/copy_thunderx2t99.c b/kernel/arm64/copy_thunderx2t99.c
index bd67b48b0..e31876139 100644
--- a/kernel/arm64/copy_thunderx2t99.c
+++ b/kernel/arm64/copy_thunderx2t99.c
@@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	if (n <= 0) return 0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		do_copy(n, x, inc_x, y, inc_y);
diff --git a/kernel/arm64/dasum_thunderx2t99.c b/kernel/arm64/dasum_thunderx2t99.c
index ba12fc776..a212c9534 100644
--- a/kernel/arm64/dasum_thunderx2t99.c
+++ b/kernel/arm64/dasum_thunderx2t99.c
@@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT asum = 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		asum = dasum_compute(n, x, inc_x);
diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c
index 8eeb94f36..3940acddd 100644
--- a/kernel/arm64/dot_thunderx2t99.c
+++ b/kernel/arm64/dot_thunderx2t99.c
@@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	"	faddp	"DOTF", v0.2d			\n"
 #endif /* !defined(DSDOT) */
 
-#else /* !defined(DOUBLE) */ 
+#else /* !defined(DOUBLE) */
 #define KERNEL_F1						\
 	"	ldr	"TMPX", ["X"]			\n"	\
 	"	ldr	"TMPY", ["Y"]			\n"	\
@@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
 	RETURN_TYPE dot = 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0 || inc_y == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || inc_y == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		dot = dot_compute(n, x, inc_x, y, inc_y);
diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c
index 2aea9b4a9..b94f0cffc 100644
--- a/kernel/arm64/dznrm2_thunderx2t99.c
+++ b/kernel/arm64/dznrm2_thunderx2t99.c
@@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if (n <= 0 || inc_x <= 0) return 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
 	if (n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		nrm2_compute(n, x, inc_x, &ssq, &scale);
diff --git a/kernel/arm64/dznrm2_thunderx2t99_fast.c b/kernel/arm64/dznrm2_thunderx2t99_fast.c
index 8b04a3eb6..8405b388b 100644
--- a/kernel/arm64/dznrm2_thunderx2t99_fast.c
+++ b/kernel/arm64/dznrm2_thunderx2t99_fast.c
@@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if (n <= 0 || inc_x <= 0) return 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
 	if (n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		nrm2 = nrm2_compute(n, x, inc_x);
diff --git a/kernel/arm64/iamax_thunderx2t99.c b/kernel/arm64/iamax_thunderx2t99.c
index a11b18419..e3bec4a20 100644
--- a/kernel/arm64/iamax_thunderx2t99.c
+++ b/kernel/arm64/iamax_thunderx2t99.c
@@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG max_index = 0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		max_index = iamax_compute(n, x, inc_x);
diff --git a/kernel/arm64/izamax_thunderx2t99.c b/kernel/arm64/izamax_thunderx2t99.c
index 8d70b0515..b2e2828f0 100644
--- a/kernel/arm64/izamax_thunderx2t99.c
+++ b/kernel/arm64/izamax_thunderx2t99.c
@@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG max_index = 0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		max_index = izamax_compute(n, x, inc_x);
diff --git a/kernel/arm64/sasum_thunderx2t99.c b/kernel/arm64/sasum_thunderx2t99.c
index 28fc34c62..014c667ba 100644
--- a/kernel/arm64/sasum_thunderx2t99.c
+++ b/kernel/arm64/sasum_thunderx2t99.c
@@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT asum = 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		asum = sasum_compute(n, x, inc_x);
diff --git a/kernel/arm64/scnrm2_thunderx2t99.c b/kernel/arm64/scnrm2_thunderx2t99.c
index b8df4962b..f96de441e 100644
--- a/kernel/arm64/scnrm2_thunderx2t99.c
+++ b/kernel/arm64/scnrm2_thunderx2t99.c
@@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if (n <= 0 || inc_x <= 0) return 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
 	if (n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		nrm2_double = nrm2_compute(n, x, inc_x);
diff --git a/kernel/arm64/zasum_thunderx2t99.c b/kernel/arm64/zasum_thunderx2t99.c
index 140e5a741..1d303a9a3 100644
--- a/kernel/arm64/zasum_thunderx2t99.c
+++ b/kernel/arm64/zasum_thunderx2t99.c
@@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT asum = 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		asum = zasum_compute(n, x, inc_x);
diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c
index 70d683077..6185bc7d9 100644
--- a/kernel/arm64/zdot_thunderx2t99.c
+++ b/kernel/arm64/zdot_thunderx2t99.c
@@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
        CIMAG(zdot) = 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0 || inc_y == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || inc_y == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		zdot_compute(n, x, inc_x, y, inc_y, &zdot);
diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c
index 059549028..0dc9cd3da 100644
--- a/kernel/x86_64/ddot.c
+++ b/kernel/x86_64/ddot.c
@@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 
-#if defined(BULLDOZER) 
+#if defined(BULLDOZER)
 #include "ddot_microk_bulldozer-2.c"
 #elif defined(STEAMROLLER)  || defined(EXCAVATOR)
 #include "ddot_microk_steamroller-2.c"
 #elif defined(PILEDRIVER)
 #include "ddot_microk_piledriver-2.c"
-#elif defined(NEHALEM) 
+#elif defined(NEHALEM)
 #include "ddot_microk_nehalem-2.c"
 #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
 #include "ddot_microk_haswell-2.c"
@@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON
 	FLOAT temp1 = 0.0;
 	FLOAT temp2 = 0.0;
 
-        BLASLONG n1 = n & -4;	
+        BLASLONG n1 = n & -4;
 
 	while(i < n1)
 	{
@@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	FLOAT dot = 0.0;
 
 #if defined(SMP)
-	nthreads = num_cpu_avail(1);
-
-	if (inc_x == 0 || inc_y == 0)
-		nthreads = 1;
-
-	if (n <= 10000)
+	if (inc_x == 0 || inc_y == 0 || n <= 10000)
 		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
 
 	if (nthreads == 1) {
 		dot = dot_compute(n, x, inc_x, y, inc_y);

From 6f71c0fce45c86c55d12b6e12e69b9ccb8ec2f28 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 11 Jun 2018 13:26:19 +0200
Subject: [PATCH 49/86] =?UTF-8?q?Return=20a=20somewhat=20sane=20default=20?=
 =?UTF-8?q?value=20for=20L2=20cache=20size=20if=20cpuid=20retur=E2=80=A6?=
 =?UTF-8?q?=20(#1611)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Return a somewhat sane default value for L2 cache size if cpuid returned something unexpected

Fixes #1610: the KVM hypervisor on Google Chromebooks returns zero for CPUID 0x80000006, causing DYNAMIC_ARCH
builds of OpenBLAS to hang.
---
 kernel/setparam-ref.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index 9030d7c6d..f654de110 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -647,7 +647,9 @@ static int get_l2_size_old(void){
       return 6144;
     }
   }
-  return 0;
+//  return 0;
+fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
+return 256;
 }
 #endif
 
@@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){
   l2 = BITMASK(ecx, 16, 0xffff);
 
 #ifndef ARCH_X86
+  if (l2 <= 0) {
+     fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
+     return 256;
+  }
   return l2;
 
 #else

From de8fff671d6081bf543b55c95655fe5f6b5e4007 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 11 Jun 2018 17:05:27 +0200
Subject: [PATCH 50/86] Revert "Use usleep instead of sched_yield by default"

---
 common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common.h b/common.h
index b7181e670..663f37e7b 100644
--- a/common.h
+++ b/common.h
@@ -356,7 +356,7 @@ typedef int blasint;
 */
 
 #ifndef YIELDING
-#define YIELDING	usleep(10)
+#define YIELDING	sched_yield()
 #endif
 
 /***

From fcb77ab129821690fac4e532640c5cfa786c3a79 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 14 Jun 2018 16:57:58 +0200
Subject: [PATCH 51/86] Update OSX deployment target to 10.8

fixes #1580
---
 Makefile.system | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index 62ba0e466..5dffd8d2e 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -248,7 +248,7 @@ endif
 
 ifeq ($(OSNAME), Darwin)
 ifndef MACOSX_DEPLOYMENT_TARGET
-export MACOSX_DEPLOYMENT_TARGET=10.6
+export MACOSX_DEPLOYMENT_TARGET=10.8
 endif
 MD5SUM = md5 -r
 endif

From bf40f806efa55c7a7c7ec57535919598eaeb569d Mon Sep 17 00:00:00 2001
From: Craig Donner <cdonner@google.com>
Date: Thu, 14 Jun 2018 12:18:04 +0100
Subject: [PATCH 52/86] Remove the need for most locking in memory.c.

Using thread local storage for tracking memory allocations means that threads
no longer have to lock at all when doing memory allocations / frees. This
particularly helps the gemm driver since it does an allocation per invocation.
Even without threading at all, this helps, since even calling a lock with
no contention has a cost:

Before this change, no threading:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4          102 ns        102 ns   13504412
BM_SGEMM/6          175 ns        175 ns    7997580
BM_SGEMM/8          205 ns        205 ns    6842073
BM_SGEMM/10         266 ns        266 ns    5294919
BM_SGEMM/16         478 ns        478 ns    2963441
BM_SGEMM/20         690 ns        690 ns    2144755
BM_SGEMM/32        1906 ns       1906 ns     716981
BM_SGEMM/40        2983 ns       2983 ns     473218
BM_SGEMM/64        9421 ns       9422 ns     148450
BM_SGEMM/72       12630 ns      12631 ns     112105
BM_SGEMM/80       15845 ns      15846 ns      89118
BM_SGEMM/90       25675 ns      25676 ns      54332
BM_SGEMM/100      29864 ns      29865 ns      47120
BM_SGEMM/112      37841 ns      37842 ns      36717
BM_SGEMM/128      56531 ns      56532 ns      25361
BM_SGEMM/140      75886 ns      75888 ns      18143
BM_SGEMM/150      98493 ns      98496 ns      14299
BM_SGEMM/160     102620 ns     102622 ns      13381
BM_SGEMM/170     135169 ns     135173 ns      10231
BM_SGEMM/180     146170 ns     146172 ns       9535
BM_SGEMM/189     190226 ns     190231 ns       7397
BM_SGEMM/200     194513 ns     194519 ns       7210
BM_SGEMM/256     396561 ns     396573 ns       3531
```
with this change:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4           95 ns         95 ns   14500387
BM_SGEMM/6          166 ns        166 ns    8381763
BM_SGEMM/8          196 ns        196 ns    7277044
BM_SGEMM/10         256 ns        256 ns    5515721
BM_SGEMM/16         463 ns        463 ns    3025197
BM_SGEMM/20         636 ns        636 ns    2070213
BM_SGEMM/32        1885 ns       1885 ns     739444
BM_SGEMM/40        2969 ns       2969 ns     472152
BM_SGEMM/64        9371 ns       9372 ns     148932
BM_SGEMM/72       12431 ns      12431 ns     112919
BM_SGEMM/80       15615 ns      15616 ns      89978
BM_SGEMM/90       25397 ns      25398 ns      55041
BM_SGEMM/100      29445 ns      29446 ns      47540
BM_SGEMM/112      37530 ns      37531 ns      37286
BM_SGEMM/128      55373 ns      55375 ns      25277
BM_SGEMM/140      76241 ns      76241 ns      18259
BM_SGEMM/150     102196 ns     102200 ns      13736
BM_SGEMM/160     101521 ns     101525 ns      13556
BM_SGEMM/170     136182 ns     136184 ns      10567
BM_SGEMM/180     146861 ns     146864 ns       9035
BM_SGEMM/189     192632 ns     192632 ns       7231
BM_SGEMM/200     198547 ns     198555 ns       6995
BM_SGEMM/256     392316 ns     392330 ns       3539
```

Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost
of small matrix operations was overshadowed by thread locking (look at the sizes
smaller than 32) even when not explicitly spawning threads:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4          328 ns        328 ns    4170562
BM_SGEMM/6          396 ns        396 ns    3536400
BM_SGEMM/8          418 ns        418 ns    3330102
BM_SGEMM/10         491 ns        491 ns    2863047
BM_SGEMM/16         710 ns        710 ns    2028314
BM_SGEMM/20         871 ns        871 ns    1581546
BM_SGEMM/32        2132 ns       2132 ns     657089
BM_SGEMM/40        3197 ns       3196 ns     437969
BM_SGEMM/64        9645 ns       9645 ns     144987
BM_SGEMM/72       35064 ns      32881 ns      50264
BM_SGEMM/80       37661 ns      35787 ns      42080
BM_SGEMM/90       36507 ns      36077 ns      40091
BM_SGEMM/100      32513 ns      31850 ns      48607
BM_SGEMM/112      41742 ns      41207 ns      37273
BM_SGEMM/128      67211 ns      65095 ns      21933
BM_SGEMM/140      68263 ns      67943 ns      19245
BM_SGEMM/150     121854 ns     115439 ns      10660
BM_SGEMM/160     116826 ns     115539 ns      10000
BM_SGEMM/170     126566 ns     122798 ns      11960
BM_SGEMM/180     130088 ns     127292 ns      11503
BM_SGEMM/189     120309 ns     116634 ns      13162
BM_SGEMM/200     114559 ns     110993 ns      10000
BM_SGEMM/256     217063 ns     207806 ns       6417
```
and after, it's gone (note this includes my other change which reduces calls
to num_cpu_avail):
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4           95 ns         95 ns   12347650
BM_SGEMM/6          166 ns        166 ns    8259683
BM_SGEMM/8          193 ns        193 ns    7162210
BM_SGEMM/10         258 ns        258 ns    5415657
BM_SGEMM/16         471 ns        471 ns    2981009
BM_SGEMM/20         666 ns        666 ns    2148002
BM_SGEMM/32        1903 ns       1903 ns     738245
BM_SGEMM/40        2969 ns       2969 ns     473239
BM_SGEMM/64        9440 ns       9440 ns     148442
BM_SGEMM/72       37239 ns      33330 ns      46813
BM_SGEMM/80       57350 ns      55949 ns      32251
BM_SGEMM/90       36275 ns      36249 ns      42259
BM_SGEMM/100      31111 ns      31008 ns      45270
BM_SGEMM/112      43782 ns      40912 ns      34749
BM_SGEMM/128      67375 ns      64406 ns      22443
BM_SGEMM/140      76389 ns      67003 ns      21430
BM_SGEMM/150      72952 ns      71830 ns      19793
BM_SGEMM/160      97039 ns      96858 ns      11498
BM_SGEMM/170     123272 ns     122007 ns      11855
BM_SGEMM/180     126828 ns     126505 ns      11567
BM_SGEMM/189     115179 ns     114665 ns      11044
BM_SGEMM/200      89289 ns      87259 ns      16147
BM_SGEMM/256     226252 ns     222677 ns       7375
```

I've also tested this with ThreadSanitizer and found no data races during
execution.  I'm not sure why 200 is always faster than its neighbors; we must
be hitting some optimal cache size or something.
---
 driver/others/memory.c | 199 +++++++++--------------------------------
 1 file changed, 43 insertions(+), 156 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index d69e52e97..85f790615 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -13,9 +13,9 @@ met:
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.
-   3. Neither the name of the OpenBLAS project nor the names of 
-      its contributors may be used to endorse or promote products 
-      derived from this software without specific prior written 
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FIXED_PAGESIZE 4096
 #endif
 
+#ifndef BUFFERS_PER_THREAD
+#ifdef USE_OPENMP
+#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
+#else
+#define BUFFERS_PER_THREAD NUM_BUFFERS
+#endif
+#endif
+
 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
 
 #if defined(_MSC_VER) && !defined(__clang__)
@@ -213,7 +221,7 @@ int i,n;
   ret = sched_getaffinity(0,size,cpusetp);
   if (ret!=0) return nums;
   ret = CPU_COUNT_S(size,cpusetp);
-  if (ret > 0 && ret < nums) nums = ret;	
+  if (ret > 0 && ret < nums) nums = ret;
   CPU_FREE(cpusetp);
   return nums;
  #endif
@@ -415,8 +423,15 @@ struct release_t {
 
 int hugetlb_allocated = 0;
 
-static struct release_t release_info[NUM_BUFFERS];
-static int release_pos = 0;
+#if defined(OS_WINDOWS)
+#define THREAD_LOCAL __declspec(thread)
+#define UNLIKELY_TO_BE_ZERO(x) (x)
+#else
+#define THREAD_LOCAL __thread
+#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0))
+#endif
+static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD];
+static int THREAD_LOCAL release_pos = 0;
 
 #if defined(OS_LINUX) && !defined(NO_WARMUP)
 static int hot_alloc = 0;
@@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){
   }
 
   if (map_address != (void *)-1) {
-#if defined(SMP) && !defined(USE_OPENMP)
-    LOCK_COMMAND(&alloc_lock);
-#endif    
     release_info[release_pos].address = map_address;
     release_info[release_pos].func    = alloc_mmap_free;
     release_pos ++;
-#if defined(SMP) && !defined(USE_OPENMP)
-    UNLOCK_COMMAND(&alloc_lock);
-#endif    
   }
 
 #ifdef OS_LINUX
@@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){
 #endif
 
   if (map_address != (void *)-1) {
-#if defined(SMP) && !defined(USE_OPENMP)
-    LOCK_COMMAND(&alloc_lock);
-#endif
     release_info[release_pos].address = map_address;
     release_info[release_pos].func    = alloc_mmap_free;
     release_pos ++;
-#if defined(SMP) && !defined(USE_OPENMP)
-    UNLOCK_COMMAND(&alloc_lock);
-#endif
   }
 
   return map_address;
@@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){
 
   tp.PrivilegeCount = 1;
   tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
-  
+
   if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
       CloseHandle(hToken);
       return (void*)-1;
@@ -961,20 +964,17 @@ static BLASULONG base_address      = 0UL;
 static BLASULONG base_address      = BASE_ADDRESS;
 #endif
 
-static volatile struct {
-  BLASULONG lock;
+struct memory_t {
   void *addr;
-#if defined(WHEREAMI) && !defined(USE_OPENMP)
-  int   pos;
-#endif
   int used;
 #ifndef __64BIT__
   char dummy[48];
 #else
   char dummy[40];
 #endif
+};
 
-} memory[NUM_BUFFERS];
+static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD];
 
 static int memory_initialized = 0;
 
@@ -987,9 +987,6 @@ static int memory_initialized = 0;
 void *blas_memory_alloc(int procpos){
 
   int position;
-#if defined(WHEREAMI) && !defined(USE_OPENMP)
-  int mypos;
-#endif
 
   void *map_address;
 
@@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){
   };
   void *(**func)(void *address);
 
-#if defined(USE_OPENMP)
-  if (!memory_initialized) {
-#endif
+  if (UNLIKELY_TO_BE_ZERO(memory_initialized)) {
 
-  LOCK_COMMAND(&alloc_lock);
+    /* Only allow a single thread to initialize memory system */
+    LOCK_COMMAND(&alloc_lock);
 
-  if (!memory_initialized) {
-
-#if defined(WHEREAMI) && !defined(USE_OPENMP)
-    for (position = 0; position < NUM_BUFFERS; position ++){
-      memory[position].addr   = (void *)0;
-      memory[position].pos    = -1;
-      memory[position].used   = 0;
-      memory[position].lock   = 0;
-    }
-#endif
+    if (!memory_initialized) {
 
 #ifdef DYNAMIC_ARCH
-    gotoblas_dynamic_init();
+      gotoblas_dynamic_init();
 #endif
 
 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
-    gotoblas_affinity_init();
+      gotoblas_affinity_init();
 #endif
 
 #ifdef SMP
-    if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
+      if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
 #endif
 
 #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
 #ifndef DYNAMIC_ARCH
-    blas_set_parameter();
+      blas_set_parameter();
 #endif
 #endif
 
-    memory_initialized = 1;
+      memory_initialized = 1;
 
+    }
+    UNLOCK_COMMAND(&alloc_lock);
   }
-  UNLOCK_COMMAND(&alloc_lock);
-#if defined(USE_OPENMP)
-  }
-#endif
 
 #ifdef DEBUG
   printf("Alloc Start ...\n");
-#endif
-
-#if defined(WHEREAMI) && !defined(USE_OPENMP)
-
-  mypos = WhereAmI();
-
-  position = mypos;
-  while (position >= NUM_BUFFERS) position >>= 1;
-
-  do {
-    if (!memory[position].used && (memory[position].pos == mypos)) {
-#if defined(SMP) && !defined(USE_OPENMP)
-      LOCK_COMMAND(&alloc_lock);
-#else      
-      blas_lock(&memory[position].lock);
-#endif
-      if (!memory[position].used) goto allocation;
-#if defined(SMP) && !defined(USE_OPENMP)
-      UNLOCK_COMMAND(&alloc_lock);
-#else
-      blas_unlock(&memory[position].lock);
-#endif      
-    }
-
-    position ++;
-
-  } while (position < NUM_BUFFERS);
-
-
 #endif
 
   position = 0;
 
   do {
-#if defined(SMP) && !defined(USE_OPENMP)
-      LOCK_COMMAND(&alloc_lock);
-#else
-    if (!memory[position].used) { 
-      blas_lock(&memory[position].lock);
-#endif
       if (!memory[position].used) goto allocation;
-#if defined(SMP) && !defined(USE_OPENMP)
-      UNLOCK_COMMAND(&alloc_lock);
-#else      
-      blas_unlock(&memory[position].lock);
-      }
-#endif
-
     position ++;
 
-  } while (position < NUM_BUFFERS);
+  } while (position < BUFFERS_PER_THREAD);
 
   goto error;
 
@@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){
 #endif
 
   memory[position].used = 1;
-#if defined(SMP) && !defined(USE_OPENMP)
-  UNLOCK_COMMAND(&alloc_lock);
-#else
-  blas_unlock(&memory[position].lock);
-#endif
 
   if (!memory[position].addr) {
     do {
@@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){
 
 #ifdef ALLOC_DEVICEDRIVER
 	if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
-	    fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
+	    fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
 	}
 #endif
 
 #ifdef ALLOC_HUGETLBFILE
 	if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
 #ifndef OS_WINDOWS
-	    fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
+	    fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
 #endif
 	}
 #endif
@@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){
 
     } while ((BLASLONG)map_address == -1);
 
-#if defined(SMP) && !defined(USE_OPENMP)
-    LOCK_COMMAND(&alloc_lock);
-#endif    
     memory[position].addr = map_address;
-#if defined(SMP) && !defined(USE_OPENMP)
-    UNLOCK_COMMAND(&alloc_lock);
-#endif
 
 #ifdef DEBUG
     printf("  Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
 #endif
   }
 
-#if defined(WHEREAMI) && !defined(USE_OPENMP)
-
-  if (memory[position].pos == -1) memory[position].pos = mypos;
-
-#endif
-
-#ifdef DYNAMIC_ARCH
-
-  if (memory_initialized == 1) {
-
-    LOCK_COMMAND(&alloc_lock);
-
-    if (memory_initialized == 1) {
-
-      if (!gotoblas) gotoblas_dynamic_init();
-
-      memory_initialized = 2;
-    }
-
-    UNLOCK_COMMAND(&alloc_lock);
-
-  }
-#endif
-
-
 #ifdef DEBUG
   printf("Mapped   : %p  %3d\n\n",
 	  (void *)memory[position].addr, position);
@@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){
   return (void *)memory[position].addr;
 
  error:
-  printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
+  printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
 
   return NULL;
 }
@@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){
 #endif
 
   position = 0;
-#if defined(SMP) && !defined(USE_OPENMP)
-  LOCK_COMMAND(&alloc_lock);
-#endif
-  while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
+  while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area))
     position++;
 
   if (memory[position].addr != free_area) goto error;
@@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){
   printf("  Position : %d\n", position);
 #endif
 
-  // arm: ensure all writes are finished before other thread takes this memory
-  WMB;
-
   memory[position].used = 0;
-#if defined(SMP) && !defined(USE_OPENMP)
-  UNLOCK_COMMAND(&alloc_lock);
-#endif
 
 #ifdef DEBUG
   printf("Unmap Succeeded.\n\n");
@@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){
   printf("BLAS : Bad memory unallocation! : %4d  %p\n", position,  free_area);
 
 #ifdef DEBUG
-  for (position = 0; position < NUM_BUFFERS; position++)
+  for (position = 0; position < BUFFERS_PER_THREAD; position++)
     printf("%4ld  %p : %d\n", position, memory[position].addr, memory[position].used);
-#endif
-#if defined(SMP) && !defined(USE_OPENMP)
-  UNLOCK_COMMAND(&alloc_lock);
 #endif
   return;
 }
@@ -1293,8 +1188,6 @@ void blas_shutdown(void){
   BLASFUNC(blas_thread_shutdown)();
 #endif
 
-  LOCK_COMMAND(&alloc_lock);
-
   for (pos = 0; pos < release_pos; pos ++) {
     release_info[pos].func(&release_info[pos]);
   }
@@ -1305,17 +1198,11 @@ void blas_shutdown(void){
   base_address      = BASE_ADDRESS;
 #endif
 
-  for (pos = 0; pos < NUM_BUFFERS; pos ++){
+  for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
     memory[pos].addr   = (void *)0;
     memory[pos].used   = 0;
-#if defined(WHEREAMI) && !defined(USE_OPENMP)
-    memory[pos].pos    = -1;
-#endif
-    memory[pos].lock   = 0;
   }
 
-  UNLOCK_COMMAND(&alloc_lock);
-
   return;
 }
 

From 47bf0dba8f7a9cbd559e2f9cabe0bf2c7d3ee7a8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Jun 2018 11:25:05 +0200
Subject: [PATCH 53/86] Add build-time option for OMP scheduler; document
 MULTITHREAD_THRESHOLD range (#1620)

* Allow choosing the OpenMP scheduler and add range hint for GEMM_MULTITHREAD_THRESHOLD
* Amended description of GEMM_MULTITHREAD_THRESHOLD
to reflect #742 making it track floating point operations rather than matrix size
---
 Makefile.rule                   | 15 +++++++++++++--
 driver/others/blas_server_omp.c |  6 +++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/Makefile.rule b/Makefile.rule
index 5c03d0195..649aabe70 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -60,6 +60,14 @@ VERSION = 0.3.1.dev
 # This flag is always set for POWER8. Don't modify the flag 
 # USE_OPENMP = 1
 
+# The OpenMP scheduler to use - by default this is "static" and you
+# will normally not want to change this unless you know that your main
+# workload will involve tasks that have highly unbalanced running times
+# for individual threads. Changing away from "static" may also adversely
+# affect memory access locality in NUMA systems. Setting to "runtime" will
+# allow you to select the scheduler from the environment variable OMP_SCHEDULE
+# CCOMMON_OPT += -DOMP_SCHED=dynamic
+
 # You can define maximum number of threads. Basically it should be
 # less than actual number of cores. If you don't specify one, it's
 # automatically detected by the the script.
@@ -156,8 +164,11 @@ NO_AFFINITY = 1
 # CONSISTENT_FPCSR = 1
 
 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
-# with single thread. You can use this flag to avoid the overhead of multi-threading
-# in small matrix sizes. The default value is 4.
+# with single thread. (Actually in recent versions this is a factor proportional to the
+# number of floating point operations necessary for the given problem size, no longer
+# an individual dimension). You can use this setting to avoid the overhead of multi-
+# threading in small matrix sizes. The default value is 4, but values as high as 50 have 
+# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
 # GEMM_MULTITHREAD_THRESHOLD = 4
 
 # If you need santy check by comparing reference BLAS. It'll be very
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index fccdb4320..4255852c8 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -48,6 +48,10 @@
 
 #else
 
+#ifndef OMP_SCHED
+#define OMP_SCHED static
+#endif
+
 int blas_server_avail = 0;
 
 static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
@@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
       break;
   }
 
-#pragma omp parallel for schedule(static)
+#pragma omp parallel for schedule(OMP_SCHED)
   for (i = 0; i < num; i ++) {
 
 #ifndef USE_SIMPLE_THREADED_LEVEL3

From 9e162146a93a58a06515bc53f07e37b8924e0d67 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 17 Jun 2018 15:32:03 +0000
Subject: [PATCH 54/86] Only initialize the part of the jobs array that will
 get used

The jobs array is getting initialized in O(compiled cpus^2) complexity.
Distros and people with bigger systems will use pretty high values
(128 or 256 or more) for this value, leading to interesting bubbles
in performance.

Baseline (single threaded performance) gets roughly 13 - 15 multiplications per cycle
in the interesting range (threading kicks in at 65x65 mult by 65x65).
The hardware is capable of 32 multiplications per cycle theoretically.

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10703.9   10.6       0.0%                             17990.6      6.3       0.0%
  64 x 64               20778.4   12.8       0.0%                             40629.2      6.5       0.0%
  65 x 65               26869.9   10.3       0.0%                             52545.7      5.3       0.0%
  80 x 80               38104.5   13.5       0.0%                             72492.7      7.1       0.0%
  96 x 96               61626.4   14.4       0.0%                            113983.8      7.8       0.0%
 112 x 112              91803.8   15.3       0.0%                            180987.3      7.8       0.0%
 128 x 128             133161.4   15.8       0.0%                            258374.3      8.1       0.0%

When threading is turned on
TARGET=SKYLAKEX F_COMPILER=GFORTRAN  SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0  NUM_THREADS=128

  Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10725.9   10.5      -0.2%                             18134.9      6.2      -0.8%
  64 x 64               20500.6   12.9       1.3%                             40929.1      6.5      -0.7%
  65 x 65             2040832.1    0.1   -7495.2%                           2097633.6      0.1   -3892.0%
  80 x 80             2063129.1    0.2   -5314.4%                           2119925.2      0.2   -2824.3%
  96 x 96             2070374.5    0.4   -3259.6%                           2173604.4      0.4   -1806.9%
 112 x 112            2111721.5    0.7   -2169.6%                           2263330.8      0.6   -1170.0%
 128 x 128            2276181.5    0.9   -1609.3%                           2377228.9      0.9    -820.1%

There is a deep deep cliff once you hit 65x65

With this patch

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10630.0   10.6       0.7%                             18112.8      6.2      -0.7%
  64 x 64               20374.8   13.0       1.9%                             40487.0      6.5       0.4%
  65 x 65              141955.2    1.9    -428.3%                            146708.8      1.9    -179.2%
  80 x 80              178921.1    2.9    -369.6%                            186032.7      2.8    -156.6%
  96 x 96              205436.2    4.3    -233.4%                            224513.1      3.9     -97.0%
 112 x 112             244408.2    5.8    -162.7%                            262158.7      5.4     -47.1%
 128 x 128             321334.5    6.5    -141.3%                            333829.0      6.3     -29.2%

The cliff is very significantly reduced.
(more to follow)
---
 driver/level3/level3_thread.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index 4ab1ee8cc..018813b8c 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -658,8 +658,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
     }
 
     /* Clear synchronization flags */
-    for (i = 0; i < MAX_CPU_NUMBER; i++) {
-      for (j = 0; j < MAX_CPU_NUMBER; j++) {
+    for (i = 0; i < nthreads; i++) {
+      for (j = 0; j < nthreads; j++) {
 	for (k = 0; k < DIVIDE_RATE; k++) {
 	  job[i].working[j][CACHE_LINE_SIZE * k] = 0;
 	}

From d148ec4ea18e672dacb1270d4a5308ccaaae18bc Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 17 Jun 2018 15:39:15 +0000
Subject: [PATCH 55/86] Don't use _Atomic for jobs sometimes...

The use of _Atomic leads to really bad code generation in the compiler
(on x86, you get two "mfence" memory barriers around each access with gcc8, despite
x86 being ordered and cache coherent). But there's a fallback in the code that
just uses volatile, which is more than plenty in practice.

If we're nervous about cross thread synchronization for these variables, we should
make the YIELD function be a compiler/memory barrier instead.

performance before (after last commit)

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10630.0   10.6       0.7%                             18112.8      6.2      -0.7%
  64 x 64               20374.8   13.0       1.9%                             40487.0      6.5       0.4%
  65 x 65              141955.2    1.9    -428.3%                            146708.8      1.9    -179.2%
  80 x 80              178921.1    2.9    -369.6%                            186032.7      2.8    -156.6%
  96 x 96              205436.2    4.3    -233.4%                            224513.1      3.9     -97.0%
 112 x 112             244408.2    5.8    -162.7%                            262158.7      5.4     -47.1%
 128 x 128             321334.5    6.5    -141.3%                            333829.0      6.3     -29.2%

Performance with this patch (roughly a 2x improvement):

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10756.0   10.5      -0.5%                             18296.7      6.1      -1.7%
  64 x 64               20490.0   12.9       1.4%                             40615.0      6.5       0.0%
  65 x 65               83528.3    3.3    -210.9%                             96319.0      2.9     -83.3%
  80 x 80              101453.5    5.1    -166.3%                            128021.7      4.0     -76.6%
  96 x 96              149795.1    5.9    -143.1%                            168059.4      5.3     -47.4%
 112 x 112             191481.2    7.3    -105.8%                            204165.0      6.9     -14.6%
 128 x 128             265019.2    7.9     -99.0%                            272006.4      7.7      -5.3%
---
 driver/level3/level3_thread.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index 018813b8c..7e75f69d1 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -91,11 +91,7 @@
 #endif
 
 typedef struct {
-#if __STDC_VERSION__ >= 201112L
-_Atomic
-#else  
   volatile
-#endif
    BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;
 

From 5c6f008365ee3c6d42f8630d27259f130a688468 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 17 Jun 2018 15:47:50 +0000
Subject: [PATCH 56/86] Tune param.h for SkylakeX

param.h defines a per-platform SWITCH_RATIO, which is used as a measure for how fine
grained the blocks for gemm need to be split up. Many platforms define this to 4.

The reality is that the gemm low level implementation for SkylakeX likes bigger blocks
due to the nature of SIMD... by tuning the SWITCH_RATIO to 32 the threading performance
improves significantly:

Before
   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10756.0   10.5      -0.5%                             18296.7      6.1      -1.7%
  64 x 64               20490.0   12.9       1.4%                             40615.0      6.5       0.0%
  65 x 65               83528.3    3.3    -210.9%                             96319.0      2.9     -83.3%
  80 x 80              101453.5    5.1    -166.3%                            128021.7      4.0     -76.6%
  96 x 96              149795.1    5.9    -143.1%                            168059.4      5.3     -47.4%
 112 x 112             191481.2    7.3    -105.8%                            204165.0      6.9     -14.6%
 128 x 128             265019.2    7.9     -99.0%                            272006.4      7.7      -5.3%

After
   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10666.3   10.6       0.4%                             18236.9      6.2      -1.4%
  64 x 64               20410.1   13.0       1.8%                             39925.8      6.6       1.7%
  65 x 65               34983.0    7.9     -30.2%                             51494.6      5.4       2.0%
  80 x 80               39769.1   13.0      -4.4%                             63805.2      8.1      12.0%
  96 x 96               45169.6   19.7      26.7%                             80065.8     11.1      29.8%
 112 x 112              57026.1   24.7      38.7%                             99535.5     14.2      44.1%
 128 x 128              64789.8   32.5      51.3%                            117407.2     17.9      54.6%

With this change, threading starts to be a win already at 96x96
---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index 49a5e85e8..3573fffbb 100644
--- a/param.h
+++ b/param.h
@@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SYMV_P  8
 
-#define SWITCH_RATIO	4
+#define SWITCH_RATIO	32
 
 #ifdef ARCH_X86
 

From 6eb4b9ae7c7cc58af00ac21b52fed8810d7e5710 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 17 Jun 2018 17:05:04 +0000
Subject: [PATCH 57/86] Tune HASWELL SWITCH_RATIO as well

Similar to the SKYLAKEX patch, 32 seems to work best
(much better than 4 or 16)

Before (4)

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               15554.3    7.2       0.2%                             30353.8      3.7       0.3%
  64 x 64               30346.8    8.7       1.6%                             63495.0      4.1      -0.1%
  65 x 65               81668.1    3.4    -123.3%                             82705.2      3.3     -21.2%
  80 x 80              105045.9    4.9     -95.5%                            115226.0      4.5      -2.2%
  96 x 96              152461.2    5.8     -74.3%                            148156.3      6.0      16.4%
 112 x 112             188505.2    7.5     -42.2%                            171187.3      8.2      36.4%
 128 x 128             257884.0    8.1     -39.5%                            224764.8      9.3      46.0%

Intermediate (16)

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               15565.7    7.2       0.2%                             30378.9      3.7       0.2%
  64 x 64               30430.2    8.7       1.3%                             63046.4      4.2       0.6%
  65 x 65               27306.0   10.1      25.3%                             38879.2      7.1      43.0%
  80 x 80               51008.7   10.1       5.1%                             61007.6      8.4      45.9%
  96 x 96               70856.7   12.5      19.0%                             83403.1     10.6      53.0%
 112 x 112              84769.9   16.6      36.0%                             99920.1     14.1      62.9%
 128 x 128              84213.2   25.0      54.5%                            113024.2     18.6      72.8%

After (32)

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               15537.3    7.2       0.3%                             30537.0      3.6      -0.3%
  64 x 64               30352.7    8.7       1.6%                             62597.8      4.2       1.3%
  65 x 65               36857.0    7.5      -0.8%                             56167.6      4.9      17.7%
  80 x 80               42552.6   12.1      20.8%                             69536.7      7.4      38.3%
  96 x 96               52101.5   17.1      40.5%                             91016.1      9.7      48.7%
 112 x 112              63853.7   22.1      51.8%                            110507.4     12.7      58.9%
 128 x 128              73966.1   28.4      60.0%                            163146.4     12.9      60.8%
---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index 3573fffbb..cfa4bba5c 100644
--- a/param.h
+++ b/param.h
@@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SYMV_P  8
 
-#define SWITCH_RATIO	4
+#define SWITCH_RATIO	32
 
 #ifdef ARCH_X86
 

From 73de17664dfdf2934a2fdc6dd9442107e6c85035 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 17 Jun 2018 17:50:43 +0000
Subject: [PATCH 58/86] Add missing barriers in gemm scheduler

a few places in the gemm scheduler code were missing barriers;
the code likely worked OK due to heavy use of volatile / _Atomic
but there's no reason to get this incorrect
---
 driver/level3/level3_thread.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index 7e75f69d1..aeb5e6ed4 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -347,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       /* Make sure if no one is using workspace */
       START_RPCC();
       for (i = 0; i < args -> nthreads; i++)
-	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
+	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
       STOP_RPCC(waiting1);
 
 #if defined(FUSED_GEMM) && !defined(TIMING)
@@ -409,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
 	  /* Wait until other region of B is initialized */
 	  START_RPCC();
-	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
+	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
 	  STOP_RPCC(waiting2);
 
           /* Apply kernel with local region of A and part of other region of B */
@@ -427,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
         /* Clear synchronization flag if this thread is done with other region of B */
 	if (m_to - m_from == min_i) {
 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+	  WMB;
 	}
       }
     } while (current != mypos);
@@ -488,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
   START_RPCC();
   for (i = 0; i < args -> nthreads; i++) {
     for (js = 0; js < DIVIDE_RATE; js++) {
-      while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
+      while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
     }
   }
   STOP_RPCC(waiting3);

From 7e39ffe1135ee6ca1dc119f6eea9566668fd0916 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 17 Jun 2018 17:53:15 +0000
Subject: [PATCH 59/86] On x86-64, make MB/WMB compiler barriers

While on x86(64) one does not normally need full memory barriers, it's
good practice to at least use compiler barriers for places where on other
architectures memory barriers are used; this prevents the compiler
from over-optimizing.
---
 common_x86_64.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/common_x86_64.h b/common_x86_64.h
index 7461aaf60..3236778b8 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -60,8 +60,13 @@
 #endif
 */
 
+#ifdef __GNUC__
+#define MB __asm__ __volatile__("": : :"memory")
+#define WMB __asm__ __volatile__("": : :"memory")
+#else
 #define MB
 #define WMB
+#endif
 
 static void __inline blas_lock(volatile BLASULONG *address){
 

From 2ddc96c9e5a86e3fd12954b3efc269f0cc8d07d8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 17 Jun 2018 18:06:24 +0000
Subject: [PATCH 60/86] make WMB / MB safer on x86-64

make it so that

if (foo)
	RMB;
else
	MB;

is always done correctly and without syntax surprises
---
 common_x86_64.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common_x86_64.h b/common_x86_64.h
index 3236778b8..62e138e34 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -61,11 +61,11 @@
 */
 
 #ifdef __GNUC__
-#define MB __asm__ __volatile__("": : :"memory")
-#define WMB __asm__ __volatile__("": : :"memory")
+#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
+#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
 #else
-#define MB
-#define WMB
+#define MB do {} while (0)
+#define WMB do {} while (0)
 #endif
 
 static void __inline blas_lock(volatile BLASULONG *address){

From 2d8cc7193ace18c28ea05ef39e13bb28437b6d89 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 17 Jun 2018 23:38:14 +0200
Subject: [PATCH 61/86] Support upcoming Intel Cannon Lake CPUs as Skylake X
 (#1621)

* Support  upcoming Cannon Lake as Skylake X
---
 cpuid_x86.c             | 17 +++++++++++++++++
 driver/others/dynamic.c | 17 +++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index fc937865c..89eb809b0 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1339,6 +1339,23 @@ int get_cpuname(void){
 	    return CPUTYPE_NEHALEM;
 	}
 	break;
+      case 6:
+        switch (model) {
+        case 6: // Cannon Lake
+#ifndef NO_AVX512
+	  return CPUTYPE_SKYLAKEX;
+#else
+	  if(support_avx())
+#ifndef NO_AVX2
+	  return CPUTYPE_HASWELL;
+#else
+	  return CPUTYPE_SANDYBRIDGE;
+#endif
+	  else
+	  return CPUTYPE_NEHALEM;
+#endif			
+        }
+      break;  
       case 9:
       case 8: 
         switch (model) {
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 4271c0a0d..bacd3b7fa 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){
 	  return &gotoblas_NEHALEM;
 	}	
 	return NULL;
+      case 6:
+        if (model == 6) {
+          // Cannon Lake
+#ifndef NO_AVX512
+	  return &gotoblas_SKYLAKEX;
+#else
+	  if(support_avx())
+#ifndef NO_AVX2
+	  return &gotoblas_HASWELL;
+#else
+	  return &gotoblas_SANDYBRIDGE;
+#endif
+	  else
+	  return &gotoblas_NEHALEM;
+#endif			
+        }
+        return NULL;  
       case 9:
       case 8:
 	if (model == 14 ) { // Kaby Lake

From 1f9e4f319327dd53d1243edb3a812c5a2366a938 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 19 Jun 2018 20:46:36 +0200
Subject: [PATCH 62/86] Handle special case of gfortran+clang+OpenMP

---
 ctest/Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/ctest/Makefile b/ctest/Makefile
index 6eda43863..569a5dda3 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -102,7 +102,13 @@ clean ::
 	rm -f x*
 
 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
-CEXTRALIB =
+ifeq ($(USE_OPENMP), 1)
+ifeq ($(F_COMPILER), GFORTRAN)
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB = -lomp
+endif
+endif
+endif
 
 # Single real
 xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)

From 6a5ab083b7e78458861b197b8e98b2506345d6d7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 19 Jun 2018 20:47:33 +0200
Subject: [PATCH 63/86] Handle special case of gfortran+clang+OpenMP

---
 test/Makefile | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/test/Makefile b/test/Makefile
index 65fb6f438..074411b05 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -122,8 +122,13 @@ endif
 
 
 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
-CEXTRALIB =
-
+ifeq ($(USE_OPENMP), 1)
+ifeq ($(F_COMPILER), GFORTRAN)
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB = -lomp
+endif
+endif
+endif
 
 sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)

From 10b70c904d9e3b610d35f1efe8d89888da4011bb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 19 Jun 2018 20:53:19 +0200
Subject: [PATCH 64/86] Handle erroneous user settings NOFORTRAN=0 and
 NO_FORTRAN

---
 Makefile | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Makefile b/Makefile
index 56b4426f8..728567f80 100644
--- a/Makefile
+++ b/Makefile
@@ -21,6 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1)
 RELA = re_lapack
 endif
 
+ifeq ($(NOFORTRAN), 0)
+undefine NOFORTRAN
+endif
+
+ifeq ($(NO_FORTRAN), 1)
+undefine NO_FORTRAN
+NOFORTRAN=1
+endif
+
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
 
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench

From 9369d3e6e5207c6974af162e67d4060ed625c322 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 19 Jun 2018 23:28:06 +0200
Subject: [PATCH 65/86] Modify NOFORTRAN tests to always check the value; fix
 rewriting of NO_FORTRAN

---
 Makefile | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index 728567f80..4760be0be 100644
--- a/Makefile
+++ b/Makefile
@@ -21,13 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1)
 RELA = re_lapack
 endif
 
-ifeq ($(NOFORTRAN), 0)
-undefine NOFORTRAN
-endif
-
 ifeq ($(NO_FORTRAN), 1)
-undefine NO_FORTRAN
-NOFORTRAN=1
+define NOFORTRAN
+1
+endef
+define NO_LAPACK
+1
+endef
+export NOFORTRAN
+export NO_LAPACK
 endif
 
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
@@ -56,7 +58,7 @@ endif
 endif
 
 	@echo "  C compiler       ... $(C_COMPILER)  (command line : $(CC))"
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 	@echo "  Fortran compiler ... $(F_COMPILER)  (command line : $(FC))"
 endif
 ifneq ($(OSNAME), AIX)
@@ -117,7 +119,7 @@ endif
 endif
 
 tests :
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 	touch $(LIBNAME)
 ifndef NO_FBLAS
 	$(MAKE) -C test all
@@ -219,7 +221,7 @@ netlib :
 
 else
 netlib : lapack_prebuild
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
 endif
@@ -240,7 +242,7 @@ prof_lapack : lapack_prebuild
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
 
 lapack_prebuild :
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 	-@echo "FORTRAN     = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "OPTS        = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "POPTS       = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -283,21 +285,21 @@ endif
 endif
 
 large.tgz :
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 	if [ ! -a $< ]; then
 	-wget http://www.netlib.org/lapack/timing/large.tgz;
 	fi
 endif
 
 timing.tgz :
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 	if [ ! -a $< ]; then
 	-wget http://www.netlib.org/lapack/timing/timing.tgz;
 	fi
 endif
 
 lapack-timing : large.tgz timing.tgz
-ifndef NOFORTRAN
+ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
 	(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
 	(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
 	$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING

From 952541e840bddbcdcdfce81aefc09edf7fbfb84f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 20 Jun 2018 13:20:30 +0200
Subject: [PATCH 66/86] Need to use filter-out to handle NOFORTRAN not set

---
 Makefile | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 4760be0be..49dab6484 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,7 @@ endif
 endif
 
 	@echo "  C compiler       ... $(C_COMPILER)  (command line : $(CC))"
-ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
+ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
 	@echo "  Fortran compiler ... $(F_COMPILER)  (command line : $(FC))"
 endif
 ifneq ($(OSNAME), AIX)
@@ -119,7 +119,7 @@ endif
 endif
 
 tests :
-ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
+ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
 	touch $(LIBNAME)
 ifndef NO_FBLAS
 	$(MAKE) -C test all
@@ -221,7 +221,7 @@ netlib :
 
 else
 netlib : lapack_prebuild
-ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
+ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
 endif
@@ -242,7 +242,10 @@ prof_lapack : lapack_prebuild
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
 
 lapack_prebuild :
-ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
+	$(info filter value of NOFORTRAN is:)
+	$(info x$(filter-out $(NOFORTRAN), 1 2)x)
+
+ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
 	-@echo "FORTRAN     = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "OPTS        = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "POPTS       = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc

From 0c5b7b400b3973d214ce24c566be4446743eacf7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 20 Jun 2018 15:16:19 +0200
Subject: [PATCH 67/86] Add -march=skylake-avx512 to flags if target is skylake
 x

---
 Makefile.x86_64 | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 1ba63278a..677c05d93 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -8,6 +8,13 @@ endif
 endif
 endif
 
+ifeq ($(CORE), SKYLAKEX)
+ifndef NO_AVX512
+CCOMMON_OPT += -march=skylake-avx512
+FCOMMON_OPT += -march=skylake-avx512
+endif
+endif
+
 ifeq ($(OSNAME), Interix)
 ARFLAGS		= -m x64
 endif

From 05978528c3f3c61fb370e1fae0ac3013faaa595e Mon Sep 17 00:00:00 2001
From: Craig Donner <cdonner@google.com>
Date: Wed, 20 Jun 2018 17:03:18 +0100
Subject: [PATCH 68/86] Avoid declaring arrays of size 0 when making large
 stack allocations.

---
 common_stackalloc.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/common_stackalloc.h b/common_stackalloc.h
index 71fb1a477..ec0fa1611 100644
--- a/common_stackalloc.h
+++ b/common_stackalloc.h
@@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * - large enough to support all architectures and kernel
  * Chosing a too small SIZE will lead to a stack smashing.
  */
-#define STACK_ALLOC(SIZE, TYPE, BUFFER)                                    \
-  /* make it volatile because some function (ex: dgemv_n.S) */             \
-  /* do not restore all register */                                        \
-  volatile int stack_alloc_size = SIZE;                                    \
-  if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE))                    \
-    stack_alloc_size = 0;                                                  \
-  STACK_ALLOC_PROTECT_SET                                                  \
-  TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20)));    \
+#define STACK_ALLOC(SIZE, TYPE, BUFFER)                                        \
+  /* make it volatile because some function (ex: dgemv_n.S) */                 \
+  /* do not restore all register */                                            \
+  volatile int stack_alloc_size = SIZE;                                        \
+  if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
+  STACK_ALLOC_PROTECT_SET                                                      \
+  /* Avoid declaring an array of length 0 */                                   \
+  TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1]                   \
+      __attribute__((aligned(0x20)));                                          \
   BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
 #else
   //Original OpenBLAS/GotoBLAS codes.

From a399d004257b2f43e8211341f924f3a73171b98c Mon Sep 17 00:00:00 2001
From: oon3m0oo <oon3m0oo@users.noreply.github.com>
Date: Wed, 20 Jun 2018 21:04:03 +0100
Subject: [PATCH 69/86] Further improvements to memory.c. (#1625)

- Compiler TLS is now only used when the compiler supports it
- If compiler TLS is unsupported, we use platform-specific TLS
- Only one variable (an index) is now in TLS
- We only access TLS once per alloc, and never when freeing
- Allocation / release info is now stored within the allocation itself, by
  over-allocating; this saves having external structures do the bookkeeping, and
  reduces some of the redundant data that was being stored (such as addresses)
- We never hit the alloc lock when not using SMP or when using OpenMP (that was
  my fault)
- Now that there are fewer tracking structures I think this is a bit easier to
  read than before
---
 driver/others/memory.c | 399 +++++++++++++++++++++++++----------------
 1 file changed, 243 insertions(+), 156 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 85f790615..ed20cf5cd 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -326,6 +326,8 @@ int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
 
+static void blas_memory_init();
+
 void openblas_fork_handler()
 {
   // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
@@ -337,7 +339,7 @@ void openblas_fork_handler()
   // implementation of OpenMP.
 #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
   int err;
-  err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
+  err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
   if(err != 0)
     openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
 #endif
@@ -415,23 +417,104 @@ int openblas_get_num_threads(void) {
 #endif
 }
 
-struct release_t {
-  void *address;
-  void (*func)(struct release_t *);
-  long attr;
-};
-
 int hugetlb_allocated = 0;
 
 #if defined(OS_WINDOWS)
 #define THREAD_LOCAL __declspec(thread)
-#define UNLIKELY_TO_BE_ZERO(x) (x)
+#define LIKELY_ONE(x) (x)
 #else
 #define THREAD_LOCAL __thread
-#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0))
+#define LIKELY_ONE(x) (__builtin_expect(x, 1))
 #endif
-static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD];
-static int THREAD_LOCAL release_pos = 0;
+
+/* Stores information about the allocation and how to release it */
+struct alloc_t {
+  /* Whether this allocation is being used */
+  int used;
+  /* Any special attributes needed when releasing this allocation */
+  int attr;
+  /* Function that can properly release this memory */
+  void (*release_func)(struct alloc_t *);
+  /* Pad to 64-byte alignment */
+  char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
+};
+
+/* Convenience macros for storing release funcs */
+#define STORE_RELEASE_FUNC(address, func)                   \
+  if (address != (void *)-1) {                              \
+    struct alloc_t *alloc_info = (struct alloc_t *)address; \
+    alloc_info->release_func = func;                        \
+  }
+
+#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr)   \
+  if (address != (void *)-1) {                              \
+    struct alloc_t *alloc_info = (struct alloc_t *)address; \
+    alloc_info->release_func = func;                        \
+    alloc_info->attr = attr;                                \
+  }
+
+/* The number of bytes that will be allocated for each buffer. When allocating
+   memory, we store an alloc_t followed by the actual buffer memory. This means
+   that each allocation always has its associated alloc_t, without the need
+   for an auxiliary tracking structure. */
+static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
+
+/* Clang supports TLS from version 2.8 */
+#if defined(__clang__) && (__clang_major__ > 2 || \
+    (__clang_major__ == 2 && __clang_minor__ >= 8))
+#define HAS_COMPILER_TLS
+#endif
+
+/* GCC supports TLS from version 4.1 */
+#if !defined(__clang__) && defined(__GNUC__) && \
+    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
+#define HAS_COMPILER_TLS
+#endif
+
+/* MSVC supports TLS from version 2005 */
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+#define HAS_COMPILER_TLS
+#endif
+
+/* Versions of XCode before 8 did not properly support TLS */
+#if defined(__apple_build_version__) && __apple_build_version__ < 8000042
+#undef HAS_COMPILER_TLS
+#endif
+
+/* Android NDK's before version 12b did not support TLS */
+#if defined(__ANDROID__) && defined(__clang__)
+#if __has_include(<android/ndk-version.h>)
+#include <android/ndk-version.h>
+#endif
+#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
+    defined(__NDK_MINOR__) &&                                               \
+    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
+#undef HAS_COMPILER_TLS
+#endif
+#endif
+
+/* Holds pointers to allocated memory */
+#if defined(SMP) && !defined(USE_OPENMP)
+/* This is the number of threads than can be spawned by the server, which is the
+   server plus the number of threads in the thread pool */
+#  define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER
+static int next_memory_table_pos = 0;
+#  if defined(HAS_COMPILER_TLS)
+/* Use compiler generated thread-local-storage */
+static int THREAD_LOCAL local_memory_table_pos = 0;
+#  else
+/* Use system-dependent thread-local-storage */
+#    if defined(OS_WINDOWS)
+static DWORD local_storage_key;
+#    else
+static pthread_key_t local_storage_key;
+#    endif /* defined(OS_WINDOWS) */
+#  endif /* defined(HAS_COMPILER_TLS) */
+#else
+/* There is only one allocating thread when in single-threaded mode and when using OpenMP */
+#  define MAX_ALLOCATING_THREADS 1
+#endif /* defined(SMP) && !defined(USE_OPENMP) */
+static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
 
 #if defined(OS_LINUX) && !defined(NO_WARMUP)
 static int hot_alloc = 0;
@@ -447,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0;
 static BLASULONG  alloc_lock = 0UL;
 #endif
 
+/* Returns a pointer to the start of the per-thread memory allocation data */
+static __inline struct alloc_t ** get_memory_table() {
+#if defined(SMP) && !defined(USE_OPENMP)
+#  if !defined(HAS_COMPILER_TLS)
+#    if defined(OS_WINDOWS)
+  int local_memory_table_pos = (int)TlsGetValue(local_storage_key);
+#    else
+  int local_memory_table_pos = (int)pthread_getspecific(local_storage_key);
+#    endif /* defined(OS_WINDOWS) */
+#  endif /* !defined(HAS_COMPILER_TLS) */
+  if (!local_memory_table_pos) {
+    LOCK_COMMAND(&alloc_lock);
+    local_memory_table_pos = next_memory_table_pos++;
+    UNLOCK_COMMAND(&alloc_lock);
+    if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
+      printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
+#  if !defined(HAS_COMPILER_TLS)
+#    if defined(OS_WINDOWS)
+    TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
+#    else
+    pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
+#    endif /* defined(OS_WINDOWS) */
+#  endif /* !defined(HAS_COMPILER_TLS) */
+  }
+  return local_memory_table[local_memory_table_pos];
+#else
+  return local_memory_table[0];
+#endif /* defined(SMP) && !defined(USE_OPENMP) */
+}
+
 #ifdef ALLOC_MMAP
 
-static void alloc_mmap_free(struct release_t *release){
+static void alloc_mmap_free(struct alloc_t *alloc_info){
 
-  if (munmap(release -> address, BUFFER_SIZE)) {
+  if (munmap(alloc_info, allocation_block_size)) {
     printf("OpenBLAS : munmap failed\n");
   }
 }
@@ -465,22 +578,18 @@ static void *alloc_mmap(void *address){
 
   if (address){
     map_address = mmap(address,
-		       BUFFER_SIZE,
+		       allocation_block_size,
 		       MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
   } else {
     map_address = mmap(address,
-		       BUFFER_SIZE,
+		       allocation_block_size,
 		       MMAP_ACCESS, MMAP_POLICY, -1, 0);
   }
 
-  if (map_address != (void *)-1) {
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].func    = alloc_mmap_free;
-    release_pos ++;
-  }
+  STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
 
 #ifdef OS_LINUX
-  my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+  my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
 #endif
 
   return map_address;
@@ -533,25 +642,25 @@ static void *alloc_mmap(void *address){
 
   if (address){
     /* Just give up use advanced operation */
-    map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
+    map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
 
 #ifdef OS_LINUX
-    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
 #endif
 
   } else {
 #if defined(OS_LINUX) && !defined(NO_WARMUP)
     if (hot_alloc == 0) {
-      map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
+      map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
 
 #ifdef OS_LINUX
-      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+      my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
 #endif
 
     } else {
 #endif
 
-      map_address = mmap(NULL, BUFFER_SIZE * SCALING,
+      map_address = mmap(NULL, allocation_block_size * SCALING,
 			 MMAP_ACCESS, MMAP_POLICY, -1, 0);
 
       if (map_address != (void *)-1) {
@@ -559,7 +668,7 @@ static void *alloc_mmap(void *address){
 #ifdef OS_LINUX
 #ifdef DEBUG
 		  int ret=0;
-		  ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
+		  ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
 		  if(ret==-1){
 			  int errsv=errno;
 			  perror("OpenBLAS alloc_mmap:");
@@ -567,7 +676,7 @@ static void *alloc_mmap(void *address){
 		  }
 
 #else
-		  my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
+		  my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
 #endif
 #endif
 
@@ -575,7 +684,7 @@ static void *alloc_mmap(void *address){
 	allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
 
 	start   = (BLASULONG)map_address;
-	current = (SCALING - 1) * BUFFER_SIZE;
+	current = (SCALING - 1) * allocation_block_size;
 
 	while(current > 0) {
 	  *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
@@ -590,7 +699,7 @@ static void *alloc_mmap(void *address){
 	best = (BLASULONG)-1;
 	best_address = map_address;
 
-	while ((start + allocsize  < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
+	while ((start + allocsize  < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
 
 	  current = run_bench(start, allocsize);
 
@@ -606,7 +715,7 @@ static void *alloc_mmap(void *address){
       if ((BLASULONG)best_address > (BLASULONG)map_address)
 	munmap(map_address,  (BLASULONG)best_address - (BLASULONG)map_address);
 
-      munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
+      munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
 
       map_address = best_address;
 
@@ -619,11 +728,7 @@ static void *alloc_mmap(void *address){
   }
 #endif
 
-  if (map_address != (void *)-1) {
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].func    = alloc_mmap_free;
-    release_pos ++;
-  }
+  STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
 
   return map_address;
 }
@@ -635,9 +740,9 @@ static void *alloc_mmap(void *address){
 
 #ifdef ALLOC_MALLOC
 
-static void alloc_malloc_free(struct release_t *release){
+static void alloc_malloc_free(struct alloc_t *alloc_info){
 
-  free(release -> address);
+  free(alloc_info);
 
 }
 
@@ -645,15 +750,11 @@ static void *alloc_malloc(void *address){
 
   void *map_address;
 
-  map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
+  map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
 
   if (map_address == (void *)NULL) map_address = (void *)-1;
 
-  if (map_address != (void *)-1) {
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].func    = alloc_malloc_free;
-    release_pos ++;
-  }
+  STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
 
   return map_address;
 
@@ -670,24 +771,20 @@ void *qfree (void *address);
 #define QCOMMS    0x2
 #define QFAST     0x4
 
-static void alloc_qalloc_free(struct release_t *release){
+static void alloc_qalloc_free(struct alloc_t *alloc_info){
 
-  qfree(release -> address);
+  qfree(alloc_info);
 
 }
 
 static void *alloc_qalloc(void *address){
   void *map_address;
 
-  map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
+  map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
 
   if (map_address == (void *)NULL) map_address = (void *)-1;
 
-  if (map_address != (void *)-1) {
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].func    = alloc_qalloc_free;
-    release_pos ++;
-  }
+  STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
 
   return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
 }
@@ -696,9 +793,9 @@ static void *alloc_qalloc(void *address){
 
 #ifdef ALLOC_WINDOWS
 
-static void alloc_windows_free(struct release_t *release){
+static void alloc_windows_free(struct alloc_t *alloc_info){
 
-  VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
+  VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
 
 }
 
@@ -706,17 +803,13 @@ static void *alloc_windows(void *address){
   void *map_address;
 
   map_address  = VirtualAlloc(address,
-			      BUFFER_SIZE,
+			      allocation_block_size,
 			      MEM_RESERVE | MEM_COMMIT,
 			      PAGE_READWRITE);
 
   if (map_address == (void *)NULL) map_address = (void *)-1;
 
-  if (map_address != (void *)-1) {
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].func    = alloc_windows_free;
-    release_pos ++;
-  }
+  STORE_RELEASE_FUNC(map_address, alloc_windows_free);
 
   return map_address;
 }
@@ -728,13 +821,14 @@ static void *alloc_windows(void *address){
 #define DEVICEDRIVER_NAME "/dev/mapper"
 #endif
 
-static void alloc_devicedirver_free(struct release_t *release){
+static void alloc_devicedirver_free(struct alloc_t *alloc_info){
 
-  if (munmap(release -> address, BUFFER_SIZE)) {
+  int attr = alloc_info -> attr; /* read the fd before unmapping: alloc_info points into the region being unmapped */
+  if (munmap(alloc_info, allocation_block_size)) {
     printf("OpenBLAS : Bugphysarea unmap failed.\n");
   }
 
-  if (close(release -> attr)) {
+  if (close(attr)) {
     printf("OpenBLAS : Bugphysarea close failed.\n");
   }
 
@@ -751,17 +845,12 @@ static void *alloc_devicedirver(void *address){
 
   }
 
-  map_address = mmap(address, BUFFER_SIZE,
+  map_address = mmap(address, allocation_block_size,
 		     PROT_READ | PROT_WRITE,
 		     MAP_FILE | MAP_SHARED,
 		     fd, 0);
 
-  if (map_address != (void *)-1) {
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].attr    = fd;
-    release_info[release_pos].func    = alloc_devicedirver_free;
-    release_pos ++;
-  }
+  STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
 
   return map_address;
 }
@@ -770,9 +859,9 @@ static void *alloc_devicedirver(void *address){
 
 #ifdef ALLOC_SHM
 
-static void alloc_shm_free(struct release_t *release){
+static void alloc_shm_free(struct alloc_t *alloc_info){
 
-  if (shmdt(release -> address)) {
+  if (shmdt(alloc_info)) {
     printf("OpenBLAS : Shared memory unmap failed.\n");
     }
 }
@@ -781,22 +870,21 @@ static void *alloc_shm(void *address){
   void *map_address;
   int shmid;
 
-  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
+  shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
 
   map_address = (void *)shmat(shmid, address, 0);
 
   if (map_address != (void *)-1){
 
 #ifdef OS_LINUX
-    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
 #endif
 
     shmctl(shmid, IPC_RMID, 0);
 
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].attr    = shmid;
-    release_info[release_pos].func    = alloc_shm_free;
-    release_pos ++;
+    struct alloc_t *alloc_info = (struct alloc_t *)map_address;
+    alloc_info->release_func = alloc_shm_free;
+    alloc_info->attr = shmid;
   }
 
   return map_address;
@@ -804,23 +892,23 @@ static void *alloc_shm(void *address){
 
 #if defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS
 
-static void alloc_hugetlb_free(struct release_t *release){
+static void alloc_hugetlb_free(struct alloc_t *alloc_info){
 
 #if defined(OS_LINUX) || defined(OS_AIX)
-  if (shmdt(release -> address)) {
+  if (shmdt(alloc_info)) {
     printf("OpenBLAS : Hugepage unmap failed.\n");
   }
 #endif
 
 #ifdef __sun__
 
-  munmap(release -> address, BUFFER_SIZE);
+  munmap(alloc_info, allocation_block_size);
 
 #endif
 
 #ifdef OS_WINDOWS
 
-  VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
+  VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
 
 #endif
 
@@ -833,7 +921,7 @@ static void *alloc_hugetlb(void *address){
 #if defined(OS_LINUX) || defined(OS_AIX)
   int shmid;
 
-  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
+  shmid = shmget(IPC_PRIVATE, allocation_block_size,
 #ifdef OS_LINUX
 		 SHM_HUGETLB |
 #endif
@@ -846,7 +934,7 @@ static void *alloc_hugetlb(void *address){
     map_address = (void *)shmat(shmid, address, SHM_RND);
 
 #ifdef OS_LINUX
-    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
+    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
 #endif
 
     if (map_address != (void *)-1){
@@ -863,7 +951,7 @@ static void *alloc_hugetlb(void *address){
   mha.mha_pagesize = HUGE_PAGESIZE;
   memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
 
-  map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
+  map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
 #endif
 
 #ifdef OS_WINDOWS
@@ -887,7 +975,7 @@ static void *alloc_hugetlb(void *address){
   }
 
   map_address  = (void *)VirtualAlloc(address,
-				      BUFFER_SIZE,
+				      allocation_block_size,
 				      MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
 				      PAGE_READWRITE);
 
@@ -898,11 +986,7 @@ static void *alloc_hugetlb(void *address){
 
 #endif
 
-  if (map_address != (void *)-1){
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].func    = alloc_hugetlb_free;
-    release_pos ++;
-  }
+  STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
 
   return map_address;
 }
@@ -914,13 +998,14 @@ static void *alloc_hugetlb(void *address){
 
 static int hugetlb_pid = 0;
 
-static void alloc_hugetlbfile_free(struct release_t *release){
+static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
 
-  if (munmap(release -> address, BUFFER_SIZE)) {
+  int attr = alloc_info -> attr;
+  if (munmap(alloc_info, allocation_block_size)) {
     printf("OpenBLAS : HugeTLBfs unmap failed.\n");
   }
 
-  if (close(release -> attr)) {
+  if (close(attr)) {
     printf("OpenBLAS : HugeTLBfs close failed.\n");
   }
 }
@@ -941,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){
 
   unlink(filename);
 
-  map_address = mmap(address, BUFFER_SIZE,
+  map_address = mmap(address, allocation_block_size,
 		     PROT_READ | PROT_WRITE,
 		     MAP_SHARED,
 		     fd, 0);
 
-  if (map_address != (void *)-1) {
-    release_info[release_pos].address = map_address;
-    release_info[release_pos].attr    = fd;
-    release_info[release_pos].func    = alloc_hugetlbfile_free;
-    release_pos ++;
-  }
+  STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
 
   return map_address;
 }
@@ -964,19 +1044,11 @@ static BLASULONG base_address      = 0UL;
 static BLASULONG base_address      = BASE_ADDRESS;
 #endif
 
-struct memory_t {
-  void *addr;
-  int used;
-#ifndef __64BIT__
-  char dummy[48];
+#if __STDC_VERSION__ >= 201112L
+static _Atomic int memory_initialized = 0;
 #else
-  char dummy[40];
+static volatile int memory_initialized = 0;
 #endif
-};
-
-static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD];
-
-static int memory_initialized = 0;
 
 /*       Memory allocation routine           */
 /* procpos ... indicates where it comes from */
@@ -984,6 +1056,20 @@ static int memory_initialized = 0;
 /*                1 : Level 2 functions      */
 /*                2 : Thread                 */
 
+static void blas_memory_init(){ /* resets the allocation bookkeeping; called from blas_memory_alloc on first use */
+#if defined(SMP) && !defined(USE_OPENMP)
+  next_memory_table_pos = 0;
+#  if !defined(HAS_COMPILER_TLS)
+#    if defined(OS_WINDOWS)
+  local_storage_key = TlsAlloc(); /* plain C call: the C++ "::" scope prefix is not valid in a .c file */
+#    else
+  pthread_key_create(&local_storage_key, NULL);
+#    endif /* defined(OS_WINDOWS) */
+#  endif /* !defined(HAS_COMPILER_TLS) */
+#endif /* defined(SMP) && !defined(USE_OPENMP) */
+  memset(local_memory_table, 0, sizeof(local_memory_table));
+}
+
 void *blas_memory_alloc(int procpos){
 
   int position;
@@ -1016,14 +1102,17 @@ void *blas_memory_alloc(int procpos){
     NULL,
   };
   void *(**func)(void *address);
+  struct alloc_t * alloc_info;
+  struct alloc_t ** alloc_table;
 
-  if (UNLIKELY_TO_BE_ZERO(memory_initialized)) {
-
+  if (!LIKELY_ONE(memory_initialized)) {
+#if defined(SMP) && !defined(USE_OPENMP)
     /* Only allow a single thread to initialize memory system */
     LOCK_COMMAND(&alloc_lock);
 
     if (!memory_initialized) {
-
+#endif
+      blas_memory_init();
 #ifdef DYNAMIC_ARCH
       gotoblas_dynamic_init();
 #endif
@@ -1044,8 +1133,10 @@ void *blas_memory_alloc(int procpos){
 
       memory_initialized = 1;
 
+#if defined(SMP) && !defined(USE_OPENMP)
     }
     UNLOCK_COMMAND(&alloc_lock);
+#endif
   }
 
 #ifdef DEBUG
@@ -1053,9 +1144,9 @@ void *blas_memory_alloc(int procpos){
 #endif
 
   position = 0;
-
+  alloc_table = get_memory_table();
   do {
-      if (!memory[position].used) goto allocation;
+      if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
     position ++;
 
   } while (position < BUFFERS_PER_THREAD);
@@ -1068,9 +1159,8 @@ void *blas_memory_alloc(int procpos){
   printf("  Position -> %d\n", position);
 #endif
 
-  memory[position].used = 1;
-
-  if (!memory[position].addr) {
+  alloc_info = alloc_table[position];
+  if (!alloc_info) {
     do {
 #ifdef DEBUG
       printf("Allocation Start : %lx\n", base_address);
@@ -1082,7 +1172,7 @@ void *blas_memory_alloc(int procpos){
 
       while ((func != NULL) && (map_address == (void *) -1)) {
 
-	map_address = (*func)((void *)base_address);
+  map_address = (*func)((void *)base_address);
 
 #ifdef ALLOC_DEVICEDRIVER
 	if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
@@ -1110,23 +1200,24 @@ void *blas_memory_alloc(int procpos){
 #endif
       if (((BLASLONG) map_address) == -1) base_address = 0UL;
 
-      if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
+      if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
 
     } while ((BLASLONG)map_address == -1);
 
-    memory[position].addr = map_address;
+    alloc_table[position] = alloc_info = map_address;
 
 #ifdef DEBUG
-    printf("  Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
+    printf("  Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
 #endif
   }
 
 #ifdef DEBUG
-  printf("Mapped   : %p  %3d\n\n",
-	  (void *)memory[position].addr, position);
+  printf("Mapped   : %p  %3d\n\n", (void *)alloc_info, position);
 #endif
 
-  return (void *)memory[position].addr;
+  alloc_info->used = 1;
+
+  return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
 
  error:
   printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
@@ -1134,25 +1225,19 @@ void *blas_memory_alloc(int procpos){
   return NULL;
 }
 
-void blas_memory_free(void *free_area){
-
+void blas_memory_free(void *buffer){
+#ifdef DEBUG
   int position;
+  struct alloc_t ** alloc_table;
+#endif
+  /* Since we passed an offset pointer to the caller, get back to the actual allocation */
+  struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
 
 #ifdef DEBUG
-  printf("Unmapped Start : %p ...\n", free_area);
+  printf("Unmapped Start : %p ...\n", alloc_info);
 #endif
 
-  position = 0;
-  while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area))
-    position++;
-
-  if (memory[position].addr != free_area) goto error;
-
-#ifdef DEBUG
-  printf("  Position : %d\n", position);
-#endif
-
-  memory[position].used = 0;
+  alloc_info->used = 0;
 
 #ifdef DEBUG
   printf("Unmap Succeeded.\n\n");
@@ -1160,12 +1245,13 @@ void blas_memory_free(void *free_area){
 
   return;
 
- error:
-  printf("BLAS : Bad memory unallocation! : %4d  %p\n", position,  free_area);
-
 #ifdef DEBUG
-  for (position = 0; position < BUFFERS_PER_THREAD; position++)
-    printf("%4ld  %p : %d\n", position, memory[position].addr, memory[position].used);
+  alloc_table = get_memory_table();
+  for (position = 0; position < BUFFERS_PER_THREAD; position++){
+    if (alloc_table[position]) {
+      printf("%4ld  %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
+    }
+  }
 #endif
   return;
 }
@@ -1182,14 +1268,20 @@ void blas_memory_free_nolock(void * map_address) {
 
 void blas_shutdown(void){
 
-  int pos;
+  int pos, thread;
 
 #ifdef SMP
   BLASFUNC(blas_thread_shutdown)();
 #endif
 
-  for (pos = 0; pos < release_pos; pos ++) {
-    release_info[pos].func(&release_info[pos]);
+  for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
+    for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
+      struct alloc_t *alloc_info = local_memory_table[thread][pos];
+      if (alloc_info) {
+        alloc_info->release_func(alloc_info);
+        local_memory_table[thread][pos] = (void *)0; /* clear the table slot itself, not just the local copy */
+      }
+    }
   }
 
 #ifdef SEEK_ADDRESS
@@ -1198,11 +1290,6 @@ void blas_shutdown(void){
   base_address      = BASE_ADDRESS;
 #endif
 
-  for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
-    memory[pos].addr   = (void *)0;
-    memory[pos].used   = 0;
-  }
-
   return;
 }
 
@@ -1226,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
   size_t size;
   BLASULONG buffer;
 
-  size   = BUFFER_SIZE - PAGESIZE;
+  size   = allocation_block_size - PAGESIZE;
   buffer = (BLASULONG)sa + GEMM_OFFSET_A;
 
 #if defined(OS_LINUX) && !defined(NO_WARMUP)
@@ -1247,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
   UNLOCK_COMMAND(&init_lock);
 #endif
 
-  size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
+  size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
   buffer = (BLASULONG)sa + GEMM_OFFSET_A;
 
   while (size > 0) {

From 28c28ed275df2fd812bcdc75fdc04cdb6d9580b3 Mon Sep 17 00:00:00 2001
From: Craig Donner <cdonner@google.com>
Date: Thu, 21 Jun 2018 11:13:57 +0100
Subject: [PATCH 70/86] Fix data races reported by TSAN.

---
 driver/others/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index ed20cf5cd..7eff16ce3 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -543,9 +543,9 @@ static __inline struct alloc_t ** get_memory_table() {
   if (!local_memory_table_pos) {
     LOCK_COMMAND(&alloc_lock);
     local_memory_table_pos = next_memory_table_pos++;
-    UNLOCK_COMMAND(&alloc_lock);
     if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
       printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
+    UNLOCK_COMMAND(&alloc_lock);
 #  if !defined(HAS_COMPILER_TLS)
 #    if defined(OS_WINDOWS)
     ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos);

From 2aa0a5804e381f89a53fdbef9bd51e8af23c8940 Mon Sep 17 00:00:00 2001
From: oon3m0oo <oon3m0oo@users.noreply.github.com>
Date: Thu, 21 Jun 2018 17:47:45 +0100
Subject: [PATCH 71/86] Use BLAS rather than CBLAS in test_fork.c (#1626)

This is handy for people not using lapack.
---
 utest/CMakeLists.txt |  2 --
 utest/Makefile       |  2 --
 utest/test_fork.c    | 22 +++++++++++++---------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 77a42d84f..1b426afe7 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -25,7 +25,6 @@ endif ()
 
 # known to hang with the native Windows and Android threads
 # FIXME needs checking if this works on any of the other platforms
-if (NOT NO_CBLAS)
 if (NOT USE_OPENMP)
 if (OS_CYGWIN_NT OR OS_LINUX)
 set(OpenBLAS_utest_src
@@ -34,7 +33,6 @@ set(OpenBLAS_utest_src
   )
 endif()
 endif()
-endif()
 
 if (NOT NO_LAPACK)
 set(OpenBLAS_utest_src
diff --git a/utest/Makefile b/utest/Makefile
index e071540dc..e40b3c6db 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -17,13 +17,11 @@ endif
 
 #this does not work with OpenMP nor with native Windows or Android threads
 # FIXME TBD if this works on OSX, SunOS, POWER and zarch
-ifneq ($(NO_CBLAS), 1)
 ifndef USE_OPENMP
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
 OBJS += test_fork.o
 endif
 endif
-endif
 
 all : run_test
 
diff --git a/utest/test_fork.c b/utest/test_fork.c
index 9e0244305..9fc51287c 100644
--- a/utest/test_fork.c
+++ b/utest/test_fork.c
@@ -13,9 +13,9 @@ met:
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.
-   3. Neither the name of the OpenBLAS project nor the names of 
-      its contributors may be used to endorse or promote products 
-      derived from this software without specific prior written 
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@@ -48,11 +48,13 @@ void* xmalloc(size_t n)
     }
 }
 
-void check_dgemm(double *a, double *b, double *result, double *expected, int n)
+void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
 {
+    char trans1 = 'T';
+    char trans2 = 'N';
+    double zerod = 0, oned = 1;
     int i;
-    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
-        1.0, a, n, b, n, 0.0, result, n);
+    BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n);
     for(i = 0; i < n * n; ++i) {
         ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS);
     }
@@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n)
 
 CTEST(fork, safety)
 {
-    int n = 1000;
+    blasint n = 1000;
     int i;
 
     double *a, *b, *c, *d;
@@ -84,8 +86,10 @@ CTEST(fork, safety)
 
     // Compute a DGEMM product in the parent process prior to forking to
     // ensure that the OpenBLAS thread pool is initialized.
-    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
-       1.0, a, n, b, n, 0.0, c, n);
+    char trans1 = 'T';
+    char trans2 = 'N';
+    double zerod = 0, oned = 1;
+    BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n);
 
     fork_pid = fork();
     if (fork_pid == -1) {

From 9cf22b7d9129e186a1ee941fbab8e45328c50b61 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Jun 2018 13:27:30 +0200
Subject: [PATCH 72/86] Build cblas_iXamin interfaces

---
 interface/Makefile | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index 9b2b93b83..20ec74e9e 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
 	  idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
 
 CSBLAS1OBJS   = \
-	cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
+	cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
 	cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
 	cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
 	cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
@@ -277,7 +277,7 @@ CSBLAS3OBJS   = \
 	cblas_sgeadd.$(SUFFIX)
 
 CDBLAS1OBJS   = \
-	cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
+	cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
 	cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
 	cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
 	cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
@@ -294,7 +294,7 @@ CDBLAS3OBJS   += \
         cblas_dgeadd.$(SUFFIX) 
 
 CCBLAS1OBJS   = \
-	cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX)  cblas_caxpy.$(SUFFIX) \
+	cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX)  cblas_caxpy.$(SUFFIX) \
 	cblas_ccopy.$(SUFFIX) \
 	cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
 	cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
@@ -320,7 +320,7 @@ CCBLAS3OBJS   = \
 
 
 CZBLAS1OBJS   = \
-	cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX)  cblas_zaxpy.$(SUFFIX) \
+	cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX)  cblas_zaxpy.$(SUFFIX) \
 	cblas_zcopy.$(SUFFIX) \
 	cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
 	cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
@@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
 cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
 
+cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
 cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
 	$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
 

From eb71d61c7cb6640e66a5239d1113de8a8c1477df Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Jun 2018 13:31:09 +0200
Subject: [PATCH 73/86] Expose CBLAS interface to BLAS extensions iXamin

---
 cblas.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cblas.h b/cblas.h
index 89f78c133..6461f4209 100644
--- a/cblas.h
+++ b/cblas.h
@@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
 CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 
+CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
 void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

From 0b2b83d9ed91e5e9234e41b1d41b0a7f21f5234c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Jun 2018 19:41:32 +0200
Subject: [PATCH 74/86] Add support for a user-defined list of dynamic targets

---
 Makefile.system | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index 62ba0e466..4712d9525 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -248,7 +248,7 @@ endif
 
 ifeq ($(OSNAME), Darwin)
 ifndef MACOSX_DEPLOYMENT_TARGET
-export MACOSX_DEPLOYMENT_TARGET=10.6
+export MACOSX_DEPLOYMENT_TARGET=10.8
 endif
 MD5SUM = md5 -r
 endif
@@ -497,6 +497,14 @@ endif
 endif
 endif
 
+ifdef DYNAMIC_LIST
+override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
+XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
+XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
+CCOMMON_OPT += $(XCCOMMON_OPT)
+#CCOMMON_OPT	+= -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
+endif
+
 # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
 ifndef DYNAMIC_CORE
 override DYNAMIC_ARCH=

From 1833a6707157abe966f39dcac90530c2461117d9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Jun 2018 19:42:15 +0200
Subject: [PATCH 75/86] Add support for a user-defined list of dynamic targets

---
 driver/others/dynamic.c | 139 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 4271c0a0d..d5ed6d164 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -49,6 +49,127 @@
 #define EXTERN
 #endif
 
+#ifdef DYNAMIC_LIST
+extern gotoblas_t gotoblas_PRESCOTT;
+
+#ifdef DYN_ATHLON
+extern gotoblas_t gotoblas_ATHLON;
+#else
+#define gotoblas_ATHLON gotoblas_PRESCOTT
+#endif
+#ifdef DYN_KATMAI
+extern gotoblas_t gotoblas_KATMAI;
+#else
+#define gotoblas_KATMAI gotoblas_PRESCOTT
+#endif
+#ifdef DYN_BANIAS
+extern gotoblas_t gotoblas_BANIAS;
+#else
+#define gotoblas_BANIAS gotoblas_PRESCOTT
+#endif
+#ifdef DYN_COPPERMINE
+extern gotoblas_t gotoblas_COPPERMINE;
+#else
+#define gotoblas_COPPERMINE gotoblas_PRESCOTT
+#endif
+#ifdef DYN_NORTHWOOD
+extern gotoblas_t gotoblas_NORTHWOOD;
+#else
+#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
+#endif
+#ifdef DYN_CORE2
+extern gotoblas_t gotoblas_CORE2;
+#else
+#define gotoblas_CORE2 gotoblas_PRESCOTT
+#endif
+#ifdef DYN_NEHALEM
+extern gotoblas_t gotoblas_NEHALEM;
+#else
+#define gotoblas_NEHALEM gotoblas_PRESCOTT
+#endif
+#ifdef DYN_BARCELONA
+extern gotoblas_t gotoblas_BARCELONA;
+#else
+#define gotoblas_BARCELONA gotoblas_PRESCOTT
+#endif
+#ifdef DYN_ATOM
+extern gotoblas_t gotoblas_ATOM;
+#else
+#define gotoblas_ATOM gotoblas_PRESCOTT
+#endif
+#ifdef DYN_NANO
+extern gotoblas_t gotoblas_NANO;
+#else
+#define gotoblas_NANO gotoblas_PRESCOTT
+#endif
+#ifdef DYN_PENRYN
+extern gotoblas_t gotoblas_PENRYN;
+#else
+#define gotoblas_PENRYN gotoblas_PRESCOTT
+#endif
+#ifdef DYN_DUNNINGTON
+extern gotoblas_t gotoblas_DUNNINGTON;
+#else
+#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
+#endif
+#ifdef DYN_OPTERON
+extern gotoblas_t gotoblas_OPTERON;
+#else
+#define gotoblas_OPTERON gotoblas_PRESCOTT
+#endif
+#ifdef DYN_OPTERON_SSE3
+extern gotoblas_t gotoblas_OPTERON_SSE3;
+#else
+#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
+#endif
+#ifdef DYN_BOBCAT
+extern gotoblas_t gotoblas_BOBCAT;
+#else
+#define gotoblas_BOBCAT gotoblas_PRESCOTT
+#endif
+#ifdef DYN_SANDYBRIDGE
+extern gotoblas_t gotoblas_SANDYBRIDGE;
+#else
+#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
+#endif
+#ifdef DYN_BULLDOZER
+extern gotoblas_t gotoblas_BULLDOZER;
+#else
+#define gotoblas_BULLDOZER gotoblas_PRESCOTT
+#endif
+#ifdef DYN_PILEDRIVER
+extern gotoblas_t gotoblas_PILEDRIVER;
+#else
+#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
+#endif
+#ifdef DYN_STEAMROLLER
+extern gotoblas_t gotoblas_STEAMROLLER;
+#else
+#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
+#endif
+#ifdef DYN_EXCAVATOR
+extern gotoblas_t gotoblas_EXCAVATOR;
+#else
+#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
+#endif
+#ifdef DYN_HASWELL
+extern gotoblas_t gotoblas_HASWELL;
+#else
+#define gotoblas_HASWELL gotoblas_PRESCOTT
+#endif
+#ifdef DYN_ZEN
+extern gotoblas_t gotoblas_ZEN;
+#else
+#define gotoblas_ZEN gotoblas_PRESCOTT
+#endif
+#ifdef DYN_SKYLAKEX
+extern gotoblas_t gotoblas_SKYLAKEX;
+#else
+#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
+#endif
+
+
+#else // not DYNAMIC_LIST
 EXTERN gotoblas_t  gotoblas_KATMAI;
 EXTERN gotoblas_t  gotoblas_COPPERMINE;
 EXTERN gotoblas_t  gotoblas_NORTHWOOD;
@@ -108,6 +229,7 @@ extern gotoblas_t  gotoblas_SKYLAKEX;
 #define gotoblas_ZEN gotoblas_BARCELONA
 #endif
 
+#endif // DYNAMIC_LIST
 
 #define VENDOR_INTEL      1
 #define VENDOR_AMD        2
@@ -338,6 +460,23 @@ static gotoblas_t *get_coretype(void){
 	  return &gotoblas_NEHALEM;
 	}	
 	return NULL;
+      case 6:
+        if (model == 6) {
+          // Cannon Lake
+#ifndef NO_AVX512
+	  return &gotoblas_SKYLAKEX;
+#else /* NO_AVX512 build: fall back to the best kernel set that is available */
+	  if(support_avx())
+#ifndef NO_AVX2
+	  return &gotoblas_HASWELL;
+#else
+	  return &gotoblas_SANDYBRIDGE;
+#endif
+	  else
+	  return &gotoblas_NEHALEM;
+#endif
+        }
+        return NULL;
       case 9:
       case 8:
 	if (model == 14 ) { // Kaby Lake

From 01440685379f11f158c5f612cf15fc279eb16c88 Mon Sep 17 00:00:00 2001
From: Craig Donner <cdonner@google.com>
Date: Mon, 25 Jun 2018 13:53:11 +0100
Subject: [PATCH 76/86] Rewrite &= -> = and simplify the initial blocking
 phase.

---
 driver/level3/level3_thread.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index aeb5e6ed4..ee3e3b9a9 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -344,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
     div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
     for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
 
-      /* Make sure if no one is using workspace */
-      START_RPCC();
-      for (i = 0; i < args -> nthreads; i++)
-	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
-      STOP_RPCC(waiting1);
-
 #if defined(FUSED_GEMM) && !defined(TIMING)
 
       /* Fused operation to copy region of B into workspace and apply kernel */
@@ -387,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       }
 #endif
 
-      /* Set flag so other threads can access local region of B */
-      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
+      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
+        /* Make sure if no one is using workspace */
+        START_RPCC();
+        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
+        STOP_RPCC(waiting1);
+        /* Set flag so other threads can access local region of B */
         job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
-      WMB;
+        WMB;
+      }
     }
 
     /* Get regions of B from other threads and apply kernel */
@@ -426,13 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
         /* Clear synchronization flag if this thread is done with other region of B */
 	if (m_to - m_from == min_i) {
-	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
 	  WMB;
 	}
       }
     } while (current != mypos);
 
-    /* Iterate through steps of m 
+    /* Iterate through steps of m
      * Note: First step has already been finished */
     for(is = m_from + min_i; is < m_to; is += min_i){
       min_i = m_to - is;
@@ -462,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
 			   c, ldc, is, js);
           STOP_RPCC(kernel);
-          
+
 #ifdef TIMING
           ops += 2 * min_i * MIN(range_n[current + 1]  - js, div_n) * min_l;
 #endif
-          
+
           /* Clear synchronization flag if this thread is done with region of B */
           if (is + min_i >= m_to) {
-            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
             WMB;
           }
 	}

From 750162a05f8c6d0d9530955f78e8e6bb138d8df9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 25 Jun 2018 21:02:31 +0200
Subject: [PATCH 77/86] Try gradual fallback for cores not in the dynamic core
 list

---
 driver/others/dynamic.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index d5ed6d164..13794207c 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -89,11 +89,15 @@ extern gotoblas_t gotoblas_NEHALEM;
 #endif
 #ifdef DYN_BARCELONA
 extern gotoblas_t gotoblas_BARCELONA;
+#elif defined(DYN_NEHALEM)
+#define gotoblas_BARCELONA gotoblas_NEHALEM
 #else
 #define gotoblas_BARCELONA gotoblas_PRESCOTT
 #endif
 #ifdef DYN_ATOM
 extern gotoblas_t gotoblas_ATOM;
+#elif defined(DYN_NEHALEM) /* ATOM not in the dynamic core list: prefer NEHALEM over PRESCOTT */
+#define gotoblas_ATOM gotoblas_NEHALEM
 #else
 #define gotoblas_ATOM gotoblas_PRESCOTT
 #endif
@@ -124,46 +128,82 @@ extern gotoblas_t gotoblas_OPTERON_SSE3;
 #endif
 #ifdef DYN_BOBCAT
 extern gotoblas_t gotoblas_BOBCAT;
+#elif defined(DYN_NEHALEM)
+#define gotoblas_BOBCAT gotoblas_NEHALEM
 #else
 #define gotoblas_BOBCAT gotoblas_PRESCOTT
 #endif
 #ifdef DYN_SANDYBRIDGE
 extern gotoblas_t gotoblas_SANDYBRIDGE;
+#elif defined(DYN_NEHALEM)
+#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
 #else
 #define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
 #endif
 #ifdef DYN_BULLDOZER
 extern gotoblas_t gotoblas_BULLDOZER;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_BULLDOZER gotoblas_NEHALEM
 #else
 #define gotoblas_BULLDOZER gotoblas_PRESCOTT
 #endif
 #ifdef DYN_PILEDRIVER
 extern gotoblas_t gotoblas_PILEDRIVER;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_PILEDRIVER gotoblas_NEHALEM
 #else
 #define gotoblas_PILEDRIVER gotoblas_PRESCOTT
 #endif
 #ifdef DYN_STEAMROLLER
 extern gotoblas_t gotoblas_STEAMROLLER;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_STEAMROLLER gotoblas_NEHALEM
 #else
 #define gotoblas_STEAMROLLER gotoblas_PRESCOTT
 #endif
 #ifdef DYN_EXCAVATOR
 extern gotoblas_t gotoblas_EXCAVATOR;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_EXCAVATOR gotoblas_NEHALEM
 #else
 #define gotoblas_EXCAVATOR gotoblas_PRESCOTT
 #endif
 #ifdef DYN_HASWELL
 extern gotoblas_t gotoblas_HASWELL;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_HASWELL gotoblas_NEHALEM
 #else
 #define gotoblas_HASWELL gotoblas_PRESCOTT
 #endif
 #ifdef DYN_ZEN
 extern gotoblas_t gotoblas_ZEN;
+#elif defined(DYN_HASWELL)
+#define gotoblas_ZEN gotoblas_HASWELL
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_ZEN gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_ZEN gotoblas_NEHALEM
 #else
 #define gotoblas_ZEN gotoblas_PRESCOTT
 #endif
 #ifdef DYN_SKYLAKEX
 extern gotoblas_t gotoblas_SKYLAKEX;
+#elif defined(DYN_HASWELL)
+#define gotoblas_SKYLAKEX gotoblas_HASWELL
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_SKYLAKEX gotoblas_NEHALEM
 #else
 #define gotoblas_SKYLAKEX gotoblas_PRESCOTT
 #endif

From 092175cfec7d49d40904aeff1d8121acb4ed1452 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 26 Jun 2018 08:09:52 +0200
Subject: [PATCH 78/86] Revert changes to NOFORTRAN handling from 952541e

---
 Makefile | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/Makefile b/Makefile
index 49dab6484..56b4426f8 100644
--- a/Makefile
+++ b/Makefile
@@ -21,17 +21,6 @@ ifeq ($(BUILD_RELAPACK), 1)
 RELA = re_lapack
 endif
 
-ifeq ($(NO_FORTRAN), 1)
-define NOFORTRAN
-1
-endef
-define NO_LAPACK
-1
-endef
-export NOFORTRAN
-export NO_LAPACK
-endif
-
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
 
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@@ -58,7 +47,7 @@ endif
 endif
 
 	@echo "  C compiler       ... $(C_COMPILER)  (command line : $(CC))"
-ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
+ifndef NOFORTRAN
 	@echo "  Fortran compiler ... $(F_COMPILER)  (command line : $(FC))"
 endif
 ifneq ($(OSNAME), AIX)
@@ -119,7 +108,7 @@ endif
 endif
 
 tests :
-ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
+ifndef NOFORTRAN
 	touch $(LIBNAME)
 ifndef NO_FBLAS
 	$(MAKE) -C test all
@@ -221,7 +210,7 @@ netlib :
 
 else
 netlib : lapack_prebuild
-ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
+ifndef NOFORTRAN
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
 endif
@@ -242,10 +231,7 @@ prof_lapack : lapack_prebuild
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
 
 lapack_prebuild :
-	$(info filter value of NOFORTRAN is:)
-	$(info x$(filter-out $(NOFORTRAN), 1 2)x)
-
-ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
+ifndef NOFORTRAN
 	-@echo "FORTRAN     = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "OPTS        = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "POPTS       = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -288,21 +274,21 @@ endif
 endif
 
 large.tgz :
-ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
+ifndef NOFORTRAN
 	if [ ! -a $< ]; then
 	-wget http://www.netlib.org/lapack/timing/large.tgz;
 	fi
 endif
 
 timing.tgz :
-ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
+ifndef NOFORTRAN
 	if [ ! -a $< ]; then
 	-wget http://www.netlib.org/lapack/timing/timing.tgz;
 	fi
 endif
 
 lapack-timing : large.tgz timing.tgz
-ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
+ifndef NOFORTRAN
 	(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
 	(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
 	$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING

From e322a951febc933e0bae192dcb117e447df24050 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 26 Jun 2018 20:44:13 +0200
Subject: [PATCH 79/86] Remove premature exit for INC_X or INC_Y zero

---
 kernel/arm/cdot_vfp.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S
index e5a6e4d35..fd86a37b0 100644
--- a/kernel/arm/cdot_vfp.S
+++ b/kernel/arm/cdot_vfp.S
@@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp	N, #0
 	ble	cdot_kernel_L999
 
-	cmp	INC_X, #0
-	beq	cdot_kernel_L999
+#	cmp	INC_X, #0
+#	beq	cdot_kernel_L999
 
-	cmp	INC_Y, #0
-	beq	cdot_kernel_L999
+#	cmp	INC_Y, #0
+#	beq	cdot_kernel_L999
 
 	cmp	INC_X, #1
 	bne	cdot_kernel_S_BEGIN

From 545b82efd30e4e0a33cb57bb7c6fb12601a6d3d9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 26 Jun 2018 20:45:00 +0200
Subject: [PATCH 80/86] Remove premature exit for INC_X or INC_Y zero

---
 kernel/arm/ddot_vfp.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S
index fb294d8b4..cc2e485b7 100644
--- a/kernel/arm/ddot_vfp.S
+++ b/kernel/arm/ddot_vfp.S
@@ -164,11 +164,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp	N, #0
 	ble	ddot_kernel_L999
 
-	cmp	INC_X, #0
-	beq	ddot_kernel_L999
+#	cmp	INC_X, #0
+#	beq	ddot_kernel_L999
 
-	cmp	INC_Y, #0
-	beq	ddot_kernel_L999
+#	cmp	INC_Y, #0
+#	beq	ddot_kernel_L999
 
 	cmp	INC_X, #1
 	bne	ddot_kernel_S_BEGIN

From e344db269b5b45d08ff4ce60801de0ece0965866 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 26 Jun 2018 20:45:57 +0200
Subject: [PATCH 81/86] Remove premature exit for INC_X or INC_Y zero

---
 kernel/arm/sdot_vfp.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S
index 5f4f424bf..544846258 100644
--- a/kernel/arm/sdot_vfp.S
+++ b/kernel/arm/sdot_vfp.S
@@ -253,11 +253,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp	N, #0
 	ble	sdot_kernel_L999
 
-	cmp	INC_X, #0
-	beq	sdot_kernel_L999
+#	cmp	INC_X, #0
+#	beq	sdot_kernel_L999
 
-	cmp	INC_Y, #0
-	beq	sdot_kernel_L999
+#	cmp	INC_Y, #0
+#	beq	sdot_kernel_L999
 
 	cmp	INC_X, #1
 	bne	sdot_kernel_S_BEGIN

From b83e4c60c73e80269e84b46590005d622d05e6d1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 26 Jun 2018 20:46:42 +0200
Subject: [PATCH 82/86] Remove premature exit for INC_X or INC_Y zero

---
 kernel/arm/zdot_vfp.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S
index 43f2c0c0b..c0cd92d3c 100644
--- a/kernel/arm/zdot_vfp.S
+++ b/kernel/arm/zdot_vfp.S
@@ -218,11 +218,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp	N, #0
 	ble	zdot_kernel_L999
 
-	cmp	INC_X, #0
-	beq	zdot_kernel_L999
+#	cmp	INC_X, #0
+#	beq	zdot_kernel_L999
 
-	cmp	INC_Y, #0
-	beq	zdot_kernel_L999
+#	cmp	INC_Y, #0
+#	beq	zdot_kernel_L999
 
 	cmp	INC_X, #1
 	bne	zdot_kernel_S_BEGIN

From f0a8dc2eec86a20a1486034a999c36709e699266 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Jun 2018 11:34:48 +0200
Subject: [PATCH 83/86] Disable the AVX512 DGEMM kernel for now

due to #1643
---
 kernel/x86_64/KERNEL.SKYLAKEX | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index c273ff8cd..2deb41b08 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -4,16 +4,16 @@ SGEMMKERNEL    =  sgemm_kernel_16x4_skylakex.S
 
 
 DTRMMKERNEL    =  ../generic/trmmkernel_16x2.c
-DGEMMKERNEL    =  dgemm_kernel_16x2_skylakex.S
-DGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
-DGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+#DGEMMKERNEL    =  dgemm_kernel_16x2_skylakex.S
+#DGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
+#DGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
+#DGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+#DGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+#DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+#DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+#DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+#DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 
 SGEMM_BETA = ../generic/gemm_beta.c
-DGEMM_BETA = ../generic/gemm_beta.c
\ No newline at end of file
+DGEMM_BETA = ../generic/gemm_beta.c

From 6e54b0a027437303e425382c7e5611c1e860632f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Jun 2018 17:31:06 +0200
Subject: [PATCH 84/86] Disable the 16x2 DTRMM kernel on SkylakeX as well

---
 kernel/x86_64/KERNEL.SKYLAKEX | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 2deb41b08..1256f4c3c 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -3,7 +3,7 @@ include $(KERNELDIR)/KERNEL.HASWELL
 SGEMMKERNEL    =  sgemm_kernel_16x4_skylakex.S
 
 
-DTRMMKERNEL    =  ../generic/trmmkernel_16x2.c
+#DTRMMKERNEL    =  ../generic/trmmkernel_16x2.c
 #DGEMMKERNEL    =  dgemm_kernel_16x2_skylakex.S
 #DGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
 #DGEMMITCOPY    =  ../generic/gemm_tcopy_16.c

From f5243e8e1fc585147e8b6e1553232f5f868eff1d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Jun 2018 23:47:44 +0200
Subject: [PATCH 85/86] Add compiler option to avx512 test and hide test output

---
 c_check | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/c_check b/c_check
index cc64c16c6..3831d7aa3 100644
--- a/c_check
+++ b/c_check
@@ -205,8 +205,8 @@ $no_avx512= 0;
 if (($architecture eq "x86") || ($architecture eq "x86_64")) {
     $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
     print $tmpf "int main(void){ __asm__ volatile($code); }\n";
-    $args = " -o $tmpf.o -x c $tmpf";
-    my @cmd = ("$compiler_name $args");
+    $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
+    my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
     system(@cmd) == 0;
     if ($? != 0) {
 	$no_avx512 = 1;

From 4e9c34018e06615ea2c0c64551691e297682e7a3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Jun 2018 23:57:50 +0200
Subject: [PATCH 86/86] Fix apparent off-by-one error in calculation of
 MAX_ALLOCATING_THREADS

fixes #1641
---
 driver/others/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 7eff16ce3..98bcfb216 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -497,7 +497,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
 #if defined(SMP) && !defined(USE_OPENMP)
 /* This is the number of threads than can be spawned by the server, which is the
    server plus the number of threads in the thread pool */
-#  define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER
+#  define MAX_ALLOCATING_THREADS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER + 1)
 static int next_memory_table_pos = 0;
 #  if defined(HAS_COMPILER_TLS)
 /* Use compiler generated thread-local-storage */