From bcb115b55b92b75d862d743629b29ac0b84d2fd0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 15 Jul 2014 13:35:36 +0200 Subject: [PATCH 01/74] added benchmark for gemv --- benchmark/Makefile | 77 ++++++++++++++- benchmark/gemv.c | 229 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 302 insertions(+), 4 deletions(-) create mode 100644 benchmark/gemv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index e3910ee96..db183c8ad 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -2,12 +2,12 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system # ACML standard -ACML=/opt/acml5.3.1/gfortran64_mp/lib -LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm +#ACML=/opt/acml5.3.1/gfortran64_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm # ACML custom -#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm +ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib +LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm # Atlas Ubuntu #ATLAS=/usr/lib/atlas-base @@ -37,6 +37,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -49,6 +50,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ @@ -61,6 +63,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ @@ -73,6 +76,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl all :: goto atlas acml mkl @@ -601,6 +605,61 @@ zher2k.atlas : zher2k.$(SUFFIX) zher2k.mkl : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sgemv.acml : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.atlas : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.mkl : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dgemv.acml : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.atlas : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.mkl : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+##################################### Cgemv #################################################### + +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cgemv.acml : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.atlas : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.mkl : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zgemv.acml : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.atlas : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.mkl : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### slinpack.$(SUFFIX) : linpack.c @@ -717,7 +776,17 @@ cher2k.$(SUFFIX) : her2k.c zher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ clean :: diff --git a/benchmark/gemv.c b/benchmark/gemv.c new file mode 100644 index 000000000..e26a36ac1 --- /dev/null +++ b/benchmark/gemv.c @@ -0,0 +1,229 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef GEMV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define GEMV BLASFUNC(dgemv) +#else +#define GEMV BLASFUNC(sgemv) +#endif + +#else + +#ifdef DOUBLE +#define GEMV BLASFUNC(zgemv) +#else +#define GEMV BLASFUNC(cgemv) +#endif + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char trans='N'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + blasint n=0; + int has_param_n = 0; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + if ((n>0) && (n<=to)) has_param_n = 1; + } + + if ( has_param_n == 1 ) + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); + else + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + if ( has_param_n == 0 ) n = m; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)n); + + for(j = 0; j < m; j++){ + for(i = 0; i 
< n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l Date: Tue, 15 Jul 2014 14:41:35 +0200 Subject: [PATCH 02/74] adjusted number of threads for small size --- interface/gemm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 07fea153c..74908e842 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -405,11 +405,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifndef COMPLEX double MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (16.0 * 1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; else { - if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (2.0 * 65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) { nthreads_max = 4; if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD ) From b985cea65dbb4d60b51d204fd6144741fb9a7f0b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 15 Jul 2014 16:04:46 +0200 Subject: [PATCH 03/74] adjust number of threads for sgemv and dgemv --- interface/gemv.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 562ceee9f..08553ad21 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -211,7 +211,18 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + + int nthreads_max = num_cpu_avail(2); + int nthreads_avail = nthreads_max; + + double MNK = (double) m * (double) n; + if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + nthreads_max = 1; + + if ( nthreads_max > nthreads_avail ) + nthreads = nthreads_avail; + else + nthreads = nthreads_max; if (nthreads == 1) { #endif From 51413925bdcc1fceec46e58acbd7cf03b7762aa1 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 15 Jul 2014 16:27:02 +0200 Subject: [PATCH 04/74] adjust number of threads for small size in cgemv and zgemv --- interface/zgemv.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/interface/zgemv.c b/interface/zgemv.c index fcc2fda54..50513a8e4 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -233,7 +233,19 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + + int nthreads_max = num_cpu_avail(2); + int nthreads_avail = nthreads_max; + + double MNK = (double) m * (double) n; + if ( MNK <= (80.0 * 20.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + nthreads_max = 1; + + if ( nthreads_max > nthreads_avail ) + nthreads = nthreads_avail; + else + nthreads = nthreads_max; + if (nthreads == 1) { #endif From 7ceb25d7b370f87f89ee900a47015d33dcaab8bf Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 16 Jul 2014 17:08:43 +0200 Subject: [PATCH 05/74] changed string GFORTRAN to lowercase --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2e378883b..3aaf092fc 100644 --- a/Makefile +++ b/Makefile @@ -247,7 +247,7 @@ ifndef NOFORTRAN -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -ifeq ($(FC), GFORTRAN) +ifeq ($(FC), gfortran) -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc ifdef SMP -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc From 
a79df1ff49a4a22394d32e9969f6ad75154f51a9 Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Wed, 16 Jul 2014 15:31:27 -0400 Subject: [PATCH 06/74] Don't create an absolute symlink when installing on Darwin --- Makefile.install | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index e0ccccbfb..c7d1d0d11 100644 --- a/Makefile.install +++ b/Makefile.install @@ -79,7 +79,8 @@ endif ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) - @-ln -fs $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib + @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) From 3c5732615dd01e4d865d6c5d516e75889b165347 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 17 Jul 2014 23:15:07 +0200 Subject: [PATCH 07/74] added blocked sgemv_n and microkernel for bulldozer and piledriver --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/KERNEL.PILEDRIVER | 1 + kernel/x86_64/sgemv_n_avx.c | 194 +++++++++++++ kernel/x86_64/sgemv_n_microk_bulldozer.c | 346 +++++++++++++++++++++++ 4 files changed, 542 insertions(+) create mode 100644 kernel/x86_64/sgemv_n_avx.c create mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 70370a73c..55932e69f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,3 +1,4 @@ +SGEMVNKERNEL = sgemv_n_avx.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 92b5dc7c9..145d9fb2f 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,3 +1,4 @@ +SGEMVNKERNEL = sgemv_n_avx.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c new file mode 100644 index 000000000..8c263543c --- /dev/null +++ b/kernel/x86_64/sgemv_n_avx.c @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_n_microk_bulldozer.c" +#endif + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i 0 ) + { + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(n2,x_ptr,xbuffer,inc_x); + + y_ptr = y; + + for(i = 0; i rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero + "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero + "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero + "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "addq $4 , %%rdi \n\t" // increment pointer of c + + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp + "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp + "vfmaddps %%ymm14, 16*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp + "vfmaddps %%ymm15, 24*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp + + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha + "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha + "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha + "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha + + "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y + "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm14, 16*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm15, 24*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0" , "xmm1", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); + +} + +static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + float *pre = a + lda*4*3; + + __asm __volatile + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + + "vxorps 
%%ymm12, %%ymm12, %%ymm12\n\t" // set to zero + "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "addq $4 , %%rdi \n\t" // increment pointer of c + + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + + "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp + "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp + + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha + "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha + + "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y + "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0" , "xmm1", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); + +} + + +static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm __volatile + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "addq $4 , %%rdi \n\t" // increment pointer of c + + "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp + + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha + + "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0" , "xmm1", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); + +} + + +static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm __volatile + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c + "addq $4 , %%rdi \n\t" // increment pointer of c + + "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp + + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + + "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0" , "xmm1", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); + +} + +static void sgemv_kernel_2( long n, float alpha, float *a, long lda, 
float *x, float *y) +{ + + + __asm __volatile + ( + "movq %0, %%rax\n\t" // n -> rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vmovss (%%rdi), %%xmm0 \n\t" // load values of c + "addq $4 , %%rdi \n\t" // increment pointer of c + + "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp + "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp + + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha + + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0" , "xmm1", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); + +} + + + +static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm __volatile + ( + "movq %0, %%rax\n\t" // n -> rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vmovss (%%rdi), %%xmm0 \n\t" // load values of c + "addq $4 , %%rdi \n\t" // increment pointer of c + + "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp + + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0" , "xmm1", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); + +} + + From c8a4a561773dcd4b905b7618b2518539d467daaf Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 18 Jul 2014 11:25:21 +0200 Subject: [PATCH 08/74] performance optimizations for sgemv_n --- kernel/x86_64/sgemv_n_avx.c | 32 ++++-- kernel/x86_64/sgemv_n_microk_bulldozer.c | 137 ++++++++++++++++++++--- 2 files changed, 146 insertions(+), 23 deletions(-) diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c index 8c263543c..dc8d015d8 100644 --- a/kernel/x86_64/sgemv_n_avx.c +++ b/kernel/x86_64/sgemv_n_avx.c @@ -70,12 +70,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO n1 = n / 512 ; n2 = n % 512 ; - m1 = m / 32; - m2 = m % 32; + m1 = m / 64; + m2 = m % 64; - x_ptr = x; - a_ptr = a; y_ptr = y; + x_ptr = x; for (j=0; j y - "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 24*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y + 
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y : : @@ -88,6 +105,94 @@ static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, "m" (pre) // 6 : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "xmm0" , "xmm1", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); + +} + + + +static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + float *pre = a + lda*3; + + __asm __volatile + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero + "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero + "vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero + "vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero + "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero + "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c + "nop \n\t" + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + + "prefetcht0 (%%r8)\n\t" // Prefetch + "vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp + "prefetcht0 64(%%r8)\n\t" // Prefetch + "vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp + "vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp + "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp + "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp + "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp + "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp + "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha + "vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha + "vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha + "vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha + "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + "vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha + "vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha + "vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha + + "vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y + "vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y + "vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y + "vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y + "vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y + "vmovups 
%%xmm13, 20*4(%%rdx) \n\t" // store temp -> y + "vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y + "vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0" , "xmm1", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); @@ -97,7 +202,7 @@ static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) { - float *pre = a + lda*4*3; + float *pre = a + lda*1; __asm __volatile ( From b3938fe371d1806233b06eff23eda4456d2f763a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 19 Jul 2014 07:15:34 +0200 Subject: [PATCH 09/74] don't use this sgemv_n on Windows --- kernel/x86_64/KERNEL.BULLDOZER | 5 ++ kernel/x86_64/KERNEL.PILEDRIVER | 5 ++ kernel/x86_64/sgemv_n_avx.c | 6 +- kernel/x86_64/sgemv_n_microk_bulldozer.c | 78 ++++++++++++------------ 4 files changed, 53 insertions(+), 41 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 55932e69f..fac8016a6 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,4 +1,9 @@ +ifdef OS_WINDOWS +SGEMVNKERNEL = ../arm/gemv_n.c +else SGEMVNKERNEL = sgemv_n_avx.c +endif + ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 145d9fb2f..555c8053d 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,4 +1,9 @@ +ifdef OS_WINDOWS +SGEMVNKERNEL = ../arm/gemv_n.c +else SGEMVNKERNEL = sgemv_n_avx.c +endif + ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c index dc8d015d8..91e3ee424 100644 --- a/kernel/x86_64/sgemv_n_avx.c +++ b/kernel/x86_64/sgemv_n_avx.c @@ -61,8 +61,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; - BLASLONG n1,n2; - BLASLONG m1,m2; + BLASLONG n1; + BLASLONG m1; + BLASLONG register m2; + BLASLONG register n2; FLOAT *xbuffer,*ybuffer; xbuffer = buffer; ybuffer = xbuffer + 2048 + 256; diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer.c b/kernel/x86_64/sgemv_n_microk_bulldozer.c index 1cecd96c5..1b07f0291 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer.c @@ -25,13 +25,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) +static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) { float *pre = a + lda*3; - __asm __volatile + __asm__ __volatile__ ( "movq %0, %%rax\n\t" // n -> rax "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 @@ -103,10 +103,10 @@ static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, "m" (x), // 4 "m" (y), // 5 "m" (pre) // 6 - : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", - "xmm0" , "xmm1", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -114,13 +114,13 @@ static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) +static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) { float *pre = a + lda*3; - __asm __volatile + __asm__ __volatile__ ( "movq %0, %%rax\n\t" // n -> rax "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 @@ -190,21 +190,16 @@ static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, "m" (x), // 4 "m" (y), // 5 "m" (pre) // 6 - : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", - "xmm0" , "xmm1", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" ); } -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) +static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) { - float *pre = a + lda*1; + float *pre = a + lda*3; - __asm __volatile + __asm__ __volatile__ ( "movq %0, %%rax\n\t" // n -> rax "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 @@ -248,20 +243,21 @@ static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, "m" (x), // 4 "m" (y), // 5 "m" (pre) // 6 - : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", - "xmm0" , "xmm1", - "xmm12", "xmm13", "xmm14", "xmm15", + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) +static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) { - __asm __volatile + __asm__ __volatile__ ( "movq %0, %%rax\n\t" // n -> rax "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 @@ -295,20 +291,21 @@ static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, f "m" (lda), // 3 "m" (x), // 4 "m" (y) // 5 - : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", - "xmm0" , "xmm1", - "xmm12", "xmm13", "xmm14", "xmm15", + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) +static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) { - __asm __volatile + __asm__ __volatile__ ( "movq %0, %%rax\n\t" // n -> rax "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 @@ -342,19 +339,20 @@ static void sgemv_kernel_4( long n, float alpha, float *a, 
long lda, float *x, f "m" (lda), // 3 "m" (x), // 4 "m" (y) // 5 - : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", - "xmm0" , "xmm1", - "xmm12", "xmm13", "xmm14", "xmm15", + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) +static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) { - __asm __volatile + __asm__ __volatile__ ( "movq %0, %%rax\n\t" // n -> rax "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 @@ -392,9 +390,10 @@ static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, f "m" (lda), // 3 "m" (x), // 4 "m" (y) // 5 - : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", - "xmm0" , "xmm1", - "xmm12", "xmm13", "xmm14", "xmm15", + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -402,11 +401,11 @@ static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, f -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) +static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) { - __asm __volatile + __asm__ __volatile__ ( "movq %0, %%rax\n\t" // n -> rax "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 @@ -440,9 +439,10 @@ static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, f "m" (lda), // 3 "m" (x), // 4 "m" (y) // 5 - : "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", - "xmm0" , "xmm1", - "xmm12", "xmm13", "xmm14", "xmm15", + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); From 2cce125c795632e4dd6f209e5b9703ed39a7ef10 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 19 Jul 2014 15:48:07 +0200 Subject: [PATCH 10/74] added optimized sgemv_t for bulldozer and piledriver --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/KERNEL.PILEDRIVER | 2 + kernel/x86_64/sgemv_t_avx.c | 228 +++++++++++++++++++++++ kernel/x86_64/sgemv_t_microk_bulldozer.c | 99 ++++++++++ 4 files changed, 331 insertions(+) create mode 100644 kernel/x86_64/sgemv_t_avx.c create mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index fac8016a6..73a9ad2ec 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,7 +1,9 @@ ifdef OS_WINDOWS SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n_avx.c +SGEMVTKERNEL = sgemv_t_avx.c endif diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 555c8053d..453e7b762 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,7 +1,9 @@ ifdef OS_WINDOWS SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n_avx.c +SGEMVTKERNEL = sgemv_t_avx.c endif diff --git a/kernel/x86_64/sgemv_t_avx.c b/kernel/x86_64/sgemv_t_avx.c new file mode 100644 index 000000000..c9cdb60cd --- /dev/null +++ b/kernel/x86_64/sgemv_t_avx.c @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_t_microk_bulldozer.c" +#endif + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i= 16 ) + { + if ( m2 & Mblock) + { + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(Mblock,x_ptr,xbuffer,inc_x); + + y_ptr = y; + a_ptrl = a_ptr; + + for(i = 0; i rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float + "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero + "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero + "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero + + "sarq $4, %%rax \n\t" // n = n / 16 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + // "prefetcht0 512(%%rsi) \n\t" + "prefetcht0 (%%r8) \n\t" //prefetch next line of a + "vmovups (%%rsi), %%xmm4 \n\t" + "vmovups 4*4(%%rsi), %%xmm5 \n\t" + "vmovups 8*4(%%rsi), %%xmm6 \n\t" + "vmovups 12*4(%%rsi), %%xmm7 \n\t" + + "vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp + "vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp + "vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp + "vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp + + "addq $16*4 , %%r8 \n\t" // increment prefetch pointer + "addq $16*4 , %%rsi \n\t" // increment pointer of a + "addq $16*4 , %%rdi \n\t" // increment pointer of c + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" + "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" + "vhaddps %%xmm12, %%xmm12, 
%%xmm12\n\t" + "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" + + "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From c06f9986d449bdaa109f742a3e5f7114b4ff50ad Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 20 Jul 2014 10:21:08 +0200 Subject: [PATCH 11/74] added sgemv_t microkernel for sandybridge --- kernel/x86_64/sgemv_t_avx.c | 2 + kernel/x86_64/sgemv_t_microk_sandy.c | 105 +++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 kernel/x86_64/sgemv_t_microk_sandy.c diff --git a/kernel/x86_64/sgemv_t_avx.c b/kernel/x86_64/sgemv_t_avx.c index c9cdb60cd..7a9efa35e 100644 --- a/kernel/x86_64/sgemv_t_avx.c +++ b/kernel/x86_64/sgemv_t_avx.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_t_microk_bulldozer.c" +#else +#include "sgemv_t_microk_sandy.c" #endif static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) diff --git a/kernel/x86_64/sgemv_t_microk_sandy.c b/kernel/x86_64/sgemv_t_microk_sandy.c new file mode 100644 index 000000000..1745db3a7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_sandy.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + //n = n / 16; + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float + "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero + "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero + "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero + + "sarq $4, %%rax \n\t" // n = n / 16 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + // "prefetcht0 512(%%rsi) \n\t" + "prefetcht0 (%%r8) \n\t" //prefetch next line of a + "vmovups (%%rsi), %%xmm4 \n\t" + "vmovups 4*4(%%rsi), %%xmm5 \n\t" + "vmovups 8*4(%%rsi), %%xmm6 \n\t" + "vmovups 12*4(%%rsi), %%xmm7 \n\t" + + "vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp + "vmulps 4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp + "vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp + "vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp + + "vaddps %%xmm12, %%xmm8 , %%xmm12\n\t" + "vaddps %%xmm13, %%xmm9 , %%xmm13\n\t" + "vaddps %%xmm14, %%xmm10, %%xmm14\n\t" + "vaddps %%xmm15, %%xmm11, %%xmm15\n\t" + + "addq $16*4 , %%r8 \n\t" // increment prefetch pointer + "addq $16*4 , %%rsi \n\t" // increment pointer of a + "addq $16*4 , %%rdi \n\t" // increment pointer of c + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" + "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" + "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" + "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" + + "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From 02eb72ac426226566b6b9d3cffd4beaacde88672 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 20 Jul 2014 10:48:41 +0200 Subject: [PATCH 12/74] bugfix in sgemv_t_microk_sandy.c --- kernel/x86_64/KERNEL.SANDYBRIDGE | 8 ++++++++ kernel/x86_64/sgemv_t_microk_sandy.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 7228357ce..7d6b81d54 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,3 +1,11 @@ +ifdef OS_WINDOWS +#SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c +else +#SGEMVNKERNEL = sgemv_n_avx.c +SGEMVTKERNEL = sgemv_t_avx.c +endif + SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/sgemv_t_microk_sandy.c b/kernel/x86_64/sgemv_t_microk_sandy.c index 1745db3a7..4ecd6d3d0 100644 --- a/kernel/x86_64/sgemv_t_microk_sandy.c +++ b/kernel/x86_64/sgemv_t_microk_sandy.c @@ -80,7 +80,8 @@ static void sgemv_kernel_16( long n, float alpha, 
float *a, long lda, float *x, "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" + "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t" + "vaddss (%%rdx), %%xmm12, %%xmm12\n\t" "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y : From d9d4077c9317b0c283dbce0547ea299dc5f1df82 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 20 Jul 2014 11:30:32 +0200 Subject: [PATCH 13/74] added sgemv_t microkernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 8 ++ kernel/x86_64/sgemv_t_avx.c | 2 + kernel/x86_64/sgemv_t_microk_haswell.c | 100 +++++++++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 kernel/x86_64/sgemv_t_microk_haswell.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ae316cff0..288e39537 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,3 +1,11 @@ +ifdef OS_WINDOWS +#SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c +else +#SGEMVNKERNEL = sgemv_n_avx.c +SGEMVTKERNEL = sgemv_t_avx.c +endif + SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/sgemv_t_avx.c b/kernel/x86_64/sgemv_t_avx.c index 7a9efa35e..55fb3d623 100644 --- a/kernel/x86_64/sgemv_t_avx.c +++ b/kernel/x86_64/sgemv_t_avx.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_t_microk_bulldozer.c" +#elif defined(HASWELL) +#include "sgemv_t_microk_haswell.c" #else #include "sgemv_t_microk_sandy.c" #endif diff --git a/kernel/x86_64/sgemv_t_microk_haswell.c b/kernel/x86_64/sgemv_t_microk_haswell.c new file mode 100644 index 000000000..ecb9845bb --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_haswell.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + //n = n / 16; + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float + "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero + "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero + "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero + + "sarq $4, %%rax \n\t" // n = n / 16 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + // "prefetcht0 512(%%rsi) \n\t" + "prefetcht0 (%%r8) \n\t" //prefetch next line of a + "vmovups (%%rsi), %%xmm4 \n\t" + "vmovups 4*4(%%rsi), %%xmm5 \n\t" + "vmovups 8*4(%%rsi), %%xmm6 \n\t" + "vmovups 12*4(%%rsi), %%xmm7 \n\t" + + "vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp + "vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp + "vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp + "vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp + + "addq $16*4 , %%r8 \n\t" // increment prefetch pointer + "addq $16*4 , %%rsi \n\t" // increment pointer of a + "addq $16*4 , %%rdi \n\t" // increment pointer of c + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" + "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" + "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" + "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" + + "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" + "vaddss (%%rdx), %%xmm12,%%xmm12\n\t" + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From c0fe95fb725aba06ce08a114a0b79f91d91ec64a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 20 Jul 2014 13:17:47 +0200 Subject: [PATCH 14/74] added sgemv_n microkernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 4 +- kernel/x86_64/sgemv_n_avx.c | 2 + kernel/x86_64/sgemv_n_microk_sandy.c | 474 +++++++++++++++++++++++++++ 3 files changed, 478 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_sandy.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 7d6b81d54..9d7a49562 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,8 +1,8 @@ ifdef OS_WINDOWS -#SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else -#SGEMVNKERNEL = sgemv_n_avx.c +SGEMVNKERNEL = sgemv_n_avx.c SGEMVTKERNEL = sgemv_t_avx.c endif diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c index 91e3ee424..96a03ec57 100644 --- a/kernel/x86_64/sgemv_n_avx.c +++ b/kernel/x86_64/sgemv_n_avx.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_n_microk_bulldozer.c" +#else +#include "sgemv_n_microk_sandy.c" #endif static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) diff --git a/kernel/x86_64/sgemv_n_microk_sandy.c b/kernel/x86_64/sgemv_n_microk_sandy.c new file mode 100644 index 000000000..7d9360f94 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_sandy.c @@ -0,0 +1,474 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
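These wrapper files pick a CPU-specific microkernel at compile time through a small include chain keyed on the target define; sgemv_t_avx.c follows the same pattern as sgemv_n_avx.c. By the end of PATCH 16 in this series the chain for sgemv_n_avx.c reads as sketched below (the HASWELL branch is added later in the series, the sandy file just introduced here is the fallback):

#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell.c"
#else
#include "sgemv_n_microk_sandy.c"
#endif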
+*****************************************************************************/ + +static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + float *pre = a + lda*3; + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero + "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero + "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero + "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero + "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero + "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero + "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "nop \n\t" + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + "prefetcht0 128(%%r8)\n\t" // Prefetch + "prefetcht0 192(%%r8)\n\t" // Prefetch + + "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp + "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp + "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp + "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp + + "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp + "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp + "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp + "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp + + "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp + "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp + "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp + "vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp + + "vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp + "vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp + "vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp + "vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha + "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha + "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha + "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha + "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha + "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha + "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha + "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha + + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y + "vmovups 
%%ymm12, 32*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + float *pre = a + lda*3; + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero + "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero + "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "nop \n\t" + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp + "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp + "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp + "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp + + "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp + "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp + "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp + "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp + + + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha + "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha + "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha + "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha + + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + + +} + +static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + float *pre = a + lda*3; + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq 
%5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "nop \n\t" + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + + "prefetcht0 (%%r8)\n\t" // Prefetch + + "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp + "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp + + "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp + "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha + "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha + + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + +} + + +static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + + "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp + "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + +} + + +static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c + + "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp + "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + 
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + + "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vmovss (%%rdi), %%xmm0 \n\t" // load values of c + + "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp + "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp + + "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp + "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha + + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vmovss (%%rdi), %%xmm0 \n\t" // load values of c + "addq $4 , %%rdi \n\t" // increment pointer of c + + "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp + "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp + + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 5392d11b045abddfe51c45e69848c807121486e8 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 20 Jul 2014 14:08:04 +0200 Subject: [PATCH 15/74] optimized sgemv_n_microk_sandy.c --- 
kernel/x86_64/sgemv_n_microk_sandy.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_sandy.c b/kernel/x86_64/sgemv_n_microk_sandy.c index 7d9360f94..9bdb06600 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy.c +++ b/kernel/x86_64/sgemv_n_microk_sandy.c @@ -29,7 +29,7 @@ static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, { - float *pre = a + lda*3; + float *pre = a + lda*2; __asm__ __volatile__ ( @@ -58,20 +58,19 @@ static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - "prefetcht0 128(%%r8)\n\t" // Prefetch - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp + "prefetcht0 64(%%r8)\n\t" // Prefetch "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp + "prefetcht0 128(%%r8)\n\t" // Prefetch "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp + "prefetcht0 192(%%r8)\n\t" // Prefetch "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp From 6acbafe45b732b3410e67db5b1a5f05ed5e90f1e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 20 Jul 2014 14:52:25 +0200 Subject: [PATCH 16/74] added sgemv_n microkernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 4 +- kernel/x86_64/sgemv_n_avx.c | 2 + kernel/x86_64/sgemv_n_microk_haswell.c | 461 +++++++++++++++++++++++++ 3 files changed, 465 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_haswell.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 288e39537..871a7d490 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,8 +1,8 @@ ifdef OS_WINDOWS -#SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else -#SGEMVNKERNEL = sgemv_n_avx.c +SGEMVNKERNEL = sgemv_n_avx.c SGEMVTKERNEL = sgemv_t_avx.c endif diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c index 96a03ec57..57aaad4b4 100644 --- a/kernel/x86_64/sgemv_n_avx.c +++ b/kernel/x86_64/sgemv_n_avx.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_n_microk_bulldozer.c" +#elif defined(HASWELL) +#include "sgemv_n_microk_haswell.c" #else #include "sgemv_n_microk_sandy.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_haswell.c b/kernel/x86_64/sgemv_n_microk_haswell.c new file mode 100644 index 000000000..9db3869d2 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_haswell.c @@ -0,0 +1,461 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
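PATCH 15 above interleaves the prefetcht0 instructions with the arithmetic instead of issuing all four back to back, and PATCH 16, whose new file begins here, is the same sgemv_n kernel retargeted at Haswell: the separate vmulps/vaddps pairs of the Sandy Bridge version are fused into vfmadd231ps. In C terms one accumulation step of the Haswell variant corresponds to a fused multiply-add, roughly as sketched below (illustrative only; the real kernel does this on 8-float ymm vectors):

#include <math.h>

/* One accumulation step as the Haswell kernel performs it: a single rounded
   fused multiply-add instead of a separate multiply and add. */
static inline float fma_step(float acc, float aij, float xj)
{
    return fmaf(aij, xj, acc);    /* maps to vfmadd231ps in the vectorized kernel */
}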
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + float *pre = a + lda*2; + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero + "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero + "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero + "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero + "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero + "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero + "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + + "prefetcht0 (%%r8)\n\t" // Prefetch + "vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp + "vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp + "prefetcht0 64(%%r8)\n\t" // Prefetch + "vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp + "vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp + "prefetcht0 128(%%r8)\n\t" // Prefetch + "vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp + "vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp + "prefetcht0 192(%%r8)\n\t" // Prefetch + "vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp + "vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi 
\n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha + "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha + "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha + "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha + "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha + "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha + "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha + "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha + + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + float *pre = a + lda*3; + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero + "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero + "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "nop \n\t" + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp + "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp + "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp + "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp + + "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp + "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp + "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp + "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp + + + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha + "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha + "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha + "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha + + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + 
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y + "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + + +} + +static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + float *pre = a + lda*3; + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + "movq %6, %%r8\n\t" // address for prefetch + "prefetcht0 (%%r8)\n\t" // Prefetch + "prefetcht0 64(%%r8)\n\t" // Prefetch + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + "nop \n\t" + "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch + + "prefetcht0 (%%r8)\n\t" // Prefetch + + "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp + "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp + + "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp + "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha + "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha + + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y), // 5 + "m" (pre) // 6 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + +} + + +static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c + + "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp + "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha + "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", + "%xmm0", 
"%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + +} + + +static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c + + "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp + "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + + "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vmovss (%%rdi), %%xmm0 \n\t" // load values of c + + "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp + "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp + + "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp + "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp + + "addq $4 , %%rdi \n\t" // increment pointer of c + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha + + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) +{ + + + __asm__ __volatile__ + ( + "movq %0, %%rax\n\t" // n -> rax + "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 + "movq %2, %%rsi\n\t" // adress of a -> rsi + "movq %3, %%rcx\n\t" // value of lda > rcx + "movq %4, %%rdi\n\t" // adress of x -> rdi + "movq %5, %%rdx\n\t" // adress of y -> rdx + + "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero + + ".L01LOOP%=: \n\t" + "vmovss (%%rdi), %%xmm0 \n\t" // load values of c + "addq $4 , %%rdi \n\t" // increment pointer of c + + "vmulss 0*4(%%rsi), %%xmm0, 
%%xmm4 \n\t" // multiply a and c and add to temp + "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp + + "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a + + "dec %%rax \n\t" // n = n -1 + "jnz .L01LOOP%= \n\t" + + "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha + + "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y + + : + : + "m" (n), // 0 + "m" (alpha), // 1 + "m" (a), // 2 + "m" (lda), // 3 + "m" (x), // 4 + "m" (y) // 5 + : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", + "%xmm0", "%xmm1", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 11637b69263ed5d910457927c587616c9a972ab6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 21 Jul 2014 06:25:42 +0200 Subject: [PATCH 17/74] add benchmark for ger --- benchmark/Makefile | 45 +++++++++- benchmark/ger.c | 218 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+), 4 deletions(-) create mode 100644 benchmark/ger.c diff --git a/benchmark/Makefile b/benchmark/Makefile index db183c8ad..4dd8be2ef 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -18,12 +18,12 @@ ATLAS=/usr/lib64/atlas LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm # Intel standard -MKL=/opt/intel/mkl/lib/intel64 -LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm +# MKL=/opt/intel/mkl/lib/intel64 +# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm # Intel custom -#MKL=/home/saar/intel_mkl -#LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm +MKL=/home/saar/intel_mkl +LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm @@ -34,6 +34,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -47,6 +48,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ @@ -60,6 +62,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas \ + chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ @@ -73,6 +77,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -659,6 +664,31 @@ zgemv.atlas : zgemv.$(SUFFIX) zgemv.mkl : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sger #################################################### +sger.goto : sger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sger.acml : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.atlas : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.mkl : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dger #################################################### +dger.goto : dger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dger.acml : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.atlas : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.mkl : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ################################################################################################### @@ -788,6 +818,13 @@ cgemv.$(SUFFIX) : gemv.c zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + clean :: @rm -f *.goto *.mkl *.acml *.atlas diff --git a/benchmark/ger.c b/benchmark/ger.c new file mode 100644 index 000000000..5085389da --- /dev/null +++ b/benchmark/ger.c @@ -0,0 +1,218 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
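The ger.c benchmark added in PATCH 17, whose source begins below, times the rank-1 update A := alpha*x*y^T + A over a range of sizes and reports MFlops; one update touches every element of the m-by-n matrix with one multiply and one add, i.e. about 2*m*n floating point operations per call. A hypothetical helper (names are illustrative, not from the committed file) for converting elapsed seconds to MFlops:

/* Rank-1 update A := alpha*x*y^T + A: 2*m*n flops per call. */
static double ger_mflops(long m, long n, double seconds)
{
    return 2.0 * (double)m * (double)n / seconds * 1.0e-6;
}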
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef GER + + +#ifdef DOUBLE +#define GER BLASFUNC(dger) +#else +#define GER BLASFUNC(sger) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + blasint n=0; + int has_param_n = 0; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + if ((n>0) && (n<=to)) has_param_n = 1; + } + + if ( has_param_n == 1 ) + fprintf(stderr, "From : %3d To : %3d Step = %3d N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,n,inc_x,inc_y,loops); + else + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + if ( has_param_n == 0 ) n = m; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)n); + + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ + 
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l=0; l Date: Mon, 21 Jul 2014 07:50:54 +0200 Subject: [PATCH 18/74] added symv benchmark --- benchmark/Makefile | 36 ++++++++ benchmark/symv.c | 218 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 benchmark/symv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 4dd8be2ef..fefd99026 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -35,6 +35,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ + ssymv.goto dsymv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -49,6 +50,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ + ssymv.acml dsymv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ @@ -63,6 +65,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas \ + ssymv.atlas dsymv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -78,6 +81,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ + ssymv.mkl dsymv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -690,6 +694,33 @@ dger.atlas : dger.$(SUFFIX) dger.mkl : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssymv #################################################### +ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +ssymv.acml : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.atlas : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.mkl : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymv #################################################### +dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dsymv.acml : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.atlas : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.mkl : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### slinpack.$(SUFFIX) : linpack.c @@ -824,6 +855,11 @@ sger.$(SUFFIX) : ger.c dger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +ssymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ clean :: diff --git a/benchmark/symv.c b/benchmark/symv.c new file mode 100644 index 000000000..4bcfb411b --- /dev/null +++ b/benchmark/symv.c @@ -0,0 +1,218 @@ +/*************************************************************************** 
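The symv.c benchmark added in PATCH 18, whose source starts here, exercises SSYMV/DSYMV, i.e. y := alpha*A*x + beta*y where A is symmetric and only the triangle selected by OPENBLAS_UPLO is referenced; that is roughly 2*n^2 flops per call. A minimal reference for the lower-triangle case (an illustration of the operation being timed, not the benchmarked routine):

static void symv_ref_lower(int n, float alpha, const float *a, int lda,
                           const float *x, float beta, float *y)
{
    int i, j;
    for (i = 0; i < n; i++) y[i] *= beta;
    for (j = 0; j < n; j++) {
        for (i = j; i < n; i++) {                  /* A(i,j), i >= j, stored at a[i + j*lda] */
            y[i] += alpha * a[i + j * lda] * x[j];
            if (i != j)                            /* use symmetry for the upper half */
                y[j] += alpha * a[i + j * lda] * x[i];
        }
    }
}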
+Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SYMV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define SYMV BLASFUNC(dsymv) +#else +#define SYMV BLASFUNC(ssymv) +#endif + +#else + +#ifdef DOUBLE +#define SYMV BLASFUNC(zsymv) +#else +#define SYMV BLASFUNC(csymv) +#endif + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if 
(argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l Date: Mon, 21 Jul 2014 14:50:24 +0200 Subject: [PATCH 19/74] added a sample plot-filter scripts and a header file for gnuplot --- benchmark/plot-filter.sh | 38 ++++++++++++++++++++++++++++++++++++ benchmark/plot-header | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100755 benchmark/plot-filter.sh create mode 100644 benchmark/plot-header diff --git a/benchmark/plot-filter.sh b/benchmark/plot-filter.sh new file mode 100755 index 000000000..b47535b6f --- /dev/null +++ b/benchmark/plot-filter.sh @@ -0,0 +1,38 @@ +#!/bin/sh +# ********************************************************************************** +# Copyright (c) 2014, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ********************************************************************************** + +# ************************************************************************ +# sample filter for data output from benchmark programs +# +# usage example: +# ./dgemm.goto 2>&1|./plotfilter.sh >OpenBLAS +# ************************************************************************ + + +awk '/MFlops/ { print $1,int($3) }'|tail --lines=+2 + diff --git a/benchmark/plot-header b/benchmark/plot-header new file mode 100644 index 000000000..070c10d0a --- /dev/null +++ b/benchmark/plot-header @@ -0,0 +1,42 @@ +# ********************************************************************************** +# Copyright (c) 2014, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ********************************************************************************** + +set term x11 font sans; +set ylabel "MFlops"; +set xlabel "Size"; +set grid xtics; +set grid ytics; +set key left; +set timestamp "generated on %Y-%m-%d by `whoami`" +set title "Dtrsm\nUPLO=U TRANS=N SIDE=L\nBulldozer 1 Thread" +plot 'OpenBLAS' smooth bezier, 'ACML' smooth bezier, 'MKL' smooth bezier; +set output "print.png"; +show title; +show plot; +show output; + + From 125610d23b35980524aad77696b3703804b3b810 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 24 Jul 2014 18:43:31 +0200 Subject: [PATCH 20/74] allow to set custom value for ?GEMM_DEFAULT_UNROLL_MN, optimizations for syrk --- common_param.h | 16 +++++++++++++++ driver/level3/level3_syrk_threaded.c | 8 ++++---- driver/level3/syrk_thread.c | 8 ++++---- kernel/setparam-ref.c | 30 ++++++++++++++++++++++++---- param.h | 1 + 5 files changed, 51 insertions(+), 12 deletions(-) diff --git a/common_param.h b/common_param.h index 863216406..1c362e8cb 100644 --- a/common_param.h +++ b/common_param.h @@ -919,14 +919,22 @@ extern gotoblas_t *gotoblas; #define SGEMM_R SGEMM_DEFAULT_R #define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M #define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#ifdef SGEMM_DEFAULT_UNROLL_MN +#define SGEMM_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN +#else #define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) +#endif #define DGEMM_P DGEMM_DEFAULT_P #define DGEMM_Q DGEMM_DEFAULT_Q #define DGEMM_R DGEMM_DEFAULT_R #define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#ifdef DGEMM_DEFAULT_UNROLL_MN +#define DGEMM_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN +#else #define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) +#endif #define QGEMM_P QGEMM_DEFAULT_P #define QGEMM_Q QGEMM_DEFAULT_Q @@ -940,14 +948,22 @@ extern gotoblas_t *gotoblas; #define CGEMM_R CGEMM_DEFAULT_R #define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M #define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N +#ifdef CGEMM_DEFAULT_UNROLL_MN +#define CGEMM_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN +#else #define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) +#endif #define ZGEMM_P ZGEMM_DEFAULT_P #define ZGEMM_Q ZGEMM_DEFAULT_Q #define ZGEMM_R ZGEMM_DEFAULT_R #define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M #define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N +#ifdef ZGEMM_DEFAULT_UNROLL_MN +#define ZGEMM_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN +#else #define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) +#endif #define XGEMM_P XGEMM_DEFAULT_P #define XGEMM_Q XGEMM_DEFAULT_Q diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 01c7b23ed..5119baad3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -538,10 +538,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; - mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; + mask = DGEMM_UNROLL_MN - 1; #else mode = BLAS_SINGLE | BLAS_REAL; - mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; + mask = SGEMM_UNROLL_MN - 1; #endif #else #ifdef XDOUBLE @@ -549,10 +549,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; - mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; + mask = ZGEMM_UNROLL_MN - 1; #else mode = BLAS_SINGLE | BLAS_COMPLEX; - mask = MAX(CGEMM_UNROLL_M, 
CGEMM_UNROLL_N) - 1; + mask = CGEMM_UNROLL_MN - 1; #endif #endif diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 0d9bdf209..94274be72 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -57,10 +57,10 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( switch (mode & BLAS_PREC) { case BLAS_SINGLE: - mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; + mask = SGEMM_UNROLL_MN - 1; break; case BLAS_DOUBLE: - mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; + mask = DGEMM_UNROLL_MN - 1; break; #ifdef EXPRECISION case BLAS_XDOUBLE: @@ -71,10 +71,10 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( } else { switch (mode & BLAS_PREC) { case BLAS_SINGLE: - mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; + mask = CGEMM_UNROLL_MN - 1; break; case BLAS_DOUBLE: - mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; + mask = ZGEMM_UNROLL_MN - 1; break; #ifdef EXPRECISION case BLAS_XDOUBLE: diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 5086420c1..b1beeae5c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -54,7 +54,14 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, 0, 0, 0, - SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), + SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, +#ifdef SGEMM_DEFAULT_UNROLL_MN + SGEMM_DEFAULT_UNROLL_MN, +#else + MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), +#endif + + #ifdef HAVE_EXCLUSIVE_CACHE 1, #else @@ -110,7 +117,12 @@ gotoblas_t TABLE_NAME = { #endif 0, 0, 0, - DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), + DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, +#ifdef DGEMM_DEFAULT_UNROLL_MN + DGEMM_DEFAULT_UNROLL_MN, +#else + MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), +#endif damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, @@ -214,7 +226,12 @@ gotoblas_t TABLE_NAME = { #endif 0, 0, 0, - CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N), + CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, +#ifdef CGEMM_DEFAULT_UNROLL_MN + CGEMM_DEFAULT_UNROLL_MN, +#else + MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N), +#endif camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, cnrm2_kTS, casum_kTS, ccopy_kTS, @@ -307,7 +324,12 @@ gotoblas_t TABLE_NAME = { #endif 0, 0, 0, - ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N), + ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, +#ifdef ZGEMM_DEFAULT_UNROLL_MN + ZGEMM_DEFAULT_UNROLL_MN, +#else + MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N), +#endif zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, znrm2_kTS, zasum_kTS, zcopy_kTS, diff --git a/param.h b/param.h index 880219b7c..863e83c32 100644 --- a/param.h +++ b/param.h @@ -1206,6 +1206,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 +#define DGEMM_DEFAULT_UNROLL_MN 16 #endif #ifdef ARCH_X86 From 1b10ff129a7cf0b072feacfbf1357eff44749d09 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 25 Jul 2014 10:00:23 +0200 Subject: [PATCH 21/74] optimizations for trmm --- driver/level3/trmm_L.c | 16 ++++++++++++---- driver/level3/trmm_R.c | 24 ++++++++++++++++++------ 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index c0a822b51..8a81d31a0 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -135,7 +135,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); @@ -199,7 +201,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); @@ -288,7 +292,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); @@ -352,7 +358,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 6012386c8..bdd9370cd 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -119,7 +119,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); @@ -137,7 +139,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); @@ -188,7 +192,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * 
(jjs - js) * COMPSIZE); @@ -239,7 +245,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); @@ -258,7 +266,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, @@ -313,7 +323,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); From 3ea4dadd3033b60397b485499bfac1f0e486d04b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 25 Jul 2014 11:59:17 +0200 Subject: [PATCH 22/74] optimizations for trsm --- driver/level3/trsm_L.c | 8 ++++++-- driver/level3/trsm_R.c | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c index fa3b0d580..78da0eb6c 100644 --- a/driver/level3/trsm_L.c +++ b/driver/level3/trsm_L.c @@ -128,7 +128,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); @@ -194,7 +196,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c index b6ee95654..169441d1e 100644 --- a/driver/level3/trsm_R.c +++ b/driver/level3/trsm_R.c @@ -123,7 +123,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); @@ -177,7 +179,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){ min_jj = min_j - 
min_l - ls + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY (min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, @@ -238,7 +242,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); @@ -297,7 +303,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){ min_jj = min_j - js + ls - jjs; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY (min_l, min_jj, a + (ls + (js - min_j + jjs) * lda) * COMPSIZE, lda, From 1d33547222b5c633fae7c0e3f803735e9a20a665 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 27 Jul 2014 11:51:42 +0200 Subject: [PATCH 23/74] optimized zgemm kernel for haswell --- driver/level3/level3.c | 12 +- driver/level3/level3_thread.c | 12 +- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 2080 +++++++++++++++++++++- 3 files changed, 2080 insertions(+), 24 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 261204099..70a6500b6 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,16 +333,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; - else - if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#else + if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#endif START_RPCC(); diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 95860d0c0..6162a9f0d 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,16 +367,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; - else - if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#else + if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#endif START_RPCC(); diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index a71fff7af..e23e09ecc 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ 
b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /******************************************************************************** -* 2014/06/28 Saar +* 2014/07/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -40,12 +40,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * A_PR1 512 * B_PR1 512 * -* +* 2014/07/28 Saar * Performance at 4608x4608x4608: -* 1 thread: 43 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) -* 2 threads: 85 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) -* 3 threads: 122 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) -* 4 threads: 156 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) * ********************************************************************************/ @@ -191,6 +191,379 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PR1 512 #define B_PR1 512 + + + +/***************************************************************************************************/ + +.macro KERNEL4x3_SUB + vmovups (AO), %ymm0 + vmovups 4 * SIZE(AO), %ymm1 + prefetcht0 A_PR1(AO) + + vbroadcastsd (BO), %ymm2 + vbroadcastsd 1 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastsd 2 * SIZE(BO), %ymm2 + vbroadcastsd 3 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastsd 4 * SIZE(BO), %ymm2 + vbroadcastsd 5 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) + + addq $6*SIZE, BO + addq $8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 + vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + vaddsubpd %ymm4 , %ymm5 ,%ymm5 + vaddsubpd %ymm6 , %ymm7 ,%ymm7 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + vmovapd %ymm5 , %ymm4 + vmovapd %ymm7 , %ymm6 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, 
%ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + vmulpd %ymm4 , %ymm0, %ymm4 + vmulpd %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + vmulpd %ymm5 , %ymm1, %ymm5 + vmulpd %ymm7 , %ymm1, %ymm7 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddpd (CO1, LDC,2), %ymm4 , %ymm4 + vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + vmovups %ymm4 , (CO1, LDC, 2) + vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + + + +/***************************************************************************************************/ + +.macro KERNEL2x3_SUB + vmovups (AO), %xmm0 + vmovups 2 * SIZE(AO), %xmm1 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) + + addq $6*SIZE, BO + addq $4*SIZE, AO + decq %rax +.endm + +.macro SAVE2x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + vaddsubpd %xmm4, %xmm5 ,%xmm5 + vaddsubpd %xmm6, %xmm7 ,%xmm7 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + vmovapd %xmm5, %xmm4 + vmovapd %xmm7, %xmm6 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + 
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + vmulpd %xmm4 , %xmm0, %xmm4 + vmulpd %xmm6 , %xmm0, %xmm6 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + vmulpd %xmm5 , %xmm1, %xmm5 + vmulpd %xmm7 , %xmm1, %xmm7 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + + vaddpd (CO1, LDC,2), %xmm4 , %xmm4 + vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + + vmovups %xmm4 , (CO1, LDC,2) + vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) + +.endm + + +/************************************************************************************************/ + + +.macro KERNEL1x3_SUB + vmovups (AO), %xmm0 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $6*SIZE, BO + addq $2*SIZE, AO + decq %rax +.endm + +.macro SAVE1x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + vaddsubpd %xmm4, %xmm5, %xmm5 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm5 , %xmm1, %xmm5 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + +#ifndef TRMMKERNEL + + vaddpd (CO1) , %xmm8 , %xmm8 + vaddpd (CO1, LDC) , %xmm10, %xmm10 + vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + + + /***************************************************************************************************/ .macro KERNEL4x2_SUB @@ -676,6 +1049,1697 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE. +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +/************************************************************************************************/ +.L6_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00_0 + ALIGN_4 + + + +.L6_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq BO2, B // next offset of B + movq K, %rax + ALIGN_4 + +.L6_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups (BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_00_02b + +.L6_00_02c: + + + +.L6_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L6_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + ALIGN_4 + +.L6_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L6_2_10: + testq $ 2, M + jz .L6_2_40 // to next 2 lines of N + +.L6_2_11: + + leaq BUFFER1, BO // first buffer to BO + + 
vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_16 + ALIGN_4 + +.L6_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + jmp .L6_2_12 + ALIGN_4 + +.L6_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_19 + ALIGN_4 + +.L6_2_17: + + KERNEL2x3_SUB + + jnz .L6_2_17 + ALIGN_4 + + +.L6_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_2_40: + testq $ 1, M + jz .L6_2_60 // to next 2 lines of N + + ALIGN_4 + +.L6_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_46 + + ALIGN_4 + +.L6_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + jmp .L6_2_42 + ALIGN_4 + +.L6_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_49 + + ALIGN_4 + +.L6_2_47: + + KERNEL1x3_SUB + + jnz .L6_2_47 + ALIGN_4 + + +.L6_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_2_41 + ALIGN_4 + + + + +.L6_2_60: + + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.L7_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq K, %rax + ALIGN_4 + +.L7_00_02b: + + vmovups 2 * SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_00_02b + +.L7_00_02c: + + movq BO2, B // next offset of B + + +.L7_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L7_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + ALIGN_4 + +.L7_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M 
+***************************************************************************/ + + +/******************************************************************************************************************/ +.L7_2_10: + testq $ 2, M + jz .L7_2_40 // to next 2 lines of N + +.L7_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_16 + ALIGN_4 + +.L7_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + jmp .L7_2_12 + ALIGN_4 + +.L7_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_19 + + ALIGN_4 + +.L7_2_17: + + KERNEL2x3_SUB + + jnz .L7_2_17 + ALIGN_4 + + +.L7_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_2_40: + testq $ 1, M + jz .L7_2_60 // to next 2 lines of N + + ALIGN_4 + +.L7_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_46 + + ALIGN_4 + +.L7_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + jmp .L7_2_42 + ALIGN_4 + +.L7_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_49 + ALIGN_4 + +.L7_2_47: + + KERNEL1x3_SUB + + jnz .L7_2_47 + ALIGN_4 + + +.L7_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_2_41 + ALIGN_4 + + + + +.L7_2_60: + + decq J // j -- + jg .L6_00_01 // next 6 lines of N + +/************************************************************************************************/ + + + +/************************************************************************************************/ +.L2_00_0: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = 
rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + 
salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of 
values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + 
KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + 
leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************************ + TRMM Kernel +************************************************************************************************/ PROLOGUE PROFCODE @@ -1811,3 +3875,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ret EPILOGUE + +#endif + + From 1cc02b43377c4246c78c660e189e4900da5338d9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 28 Jul 2014 11:50:01 +0200 Subject: [PATCH 24/74] optimized sgemm kernel for haswell --- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 3707 ++++++++++++++++++++- param.h | 3 +- 2 files changed, 3680 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index 1f9f88657..d88add02b 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /********************************************************************* -* 2013/11/13 Saar +* 2014/07/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -40,12 +40,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * A_PR1 512 * B_PR1 512 * -* +* +* 2014/07/28 Saar * Performance at 9216x9216x9216: -* 1 thread: 86 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) -* 2 threads: 157 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) -* 3 threads: 235 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) -* 4 threads: 288 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) * *********************************************************************/ @@ -69,6 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CO1 %r15 #define K %r12 #define BI %rbp +#define BO2 %rbp #define SP %rbx #define BO1 %rdi @@ -90,7 +92,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#if defined(OS_WINDOWS) #define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) @@ -145,6 +151,373 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PR1 512 #define B_PR1 512 +/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(AO), %ymm1 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) + + addq $6*SIZE, BO + addq $16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm13, %ymm13 + vmulps %ymm0 , %ymm14, %ymm14 + vmulps %ymm0 , %ymm15, %ymm15 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 + + vaddps (CO2), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2), %ymm11,%ymm11 + + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 + + vaddps (CO2, LDC,2), %ymm14,%ymm14 + vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) + + vmovups %ymm10, (CO2) + vmovups %ymm11, 8 * SIZE(CO2) + + vmovups %ymm12, (CO2, LDC) + vmovups %ymm13, 8 * SIZE(CO2, LDC) + + vmovups %ymm14, (CO2, LDC,2) + vmovups %ymm15, 8 * SIZE(CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $6*SIZE, BO + addq $8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if 
!defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $6*SIZE, BO + addq $4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $6*SIZE, BO + addq $2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * 
SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss %xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $6*SIZE, BO + addq $1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + /******************************************************************************************* * 4 lines of N *******************************************************************************************/ @@ -343,17 +716,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - vaddps (CO2), %xmm8,%xmm8 - vaddps 1 * SIZE(CO2), %xmm9,%xmm9 + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - vaddps (CO2, LDC), %xmm10,%xmm10 - vaddps 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 #endif @@ -400,10 +773,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 #endif @@ -556,11 +929,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 #endif @@ -594,8 +967,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
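The vaddps to vaddss changes in the surrounding hunks touch the save macros for the one- and two-row M tails (SAVE2x4, SAVE1x4, SAVE2x2, SAVE1x2, SAVE2x1, SAVE1x1). Only one or two scalar elements per column are live there and the stores are scalar vmovss, so the packed vaddps form read four floats from C for each element actually written; the scalar form matches the new 2x6/1x6 macros above. A rough C equivalent of the per-column tail update, with illustrative names only:

/* scalar tail update for one column of C: c[i] = alpha*acc[i] + c[i] */
static void save_tail_col(float *c, float alpha, const float *acc, int rows)
{
    for (int i = 0; i < rows; i++)
        c[i] = alpha * acc[i] + c[i];
}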
#if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 #endif @@ -717,8 +1090,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 #endif @@ -746,7 +1119,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 + vaddss (CO1), %xmm4,%xmm4 #endif @@ -757,6 +1130,3283 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*******************************************************************************************/ +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + 
KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // 
first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * 
SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, 
(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + 
jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), 
AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) 
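For reference, the .L2_01a/.L2_02c pair being emitted here packs the two active columns of B into the on-stack BUFFER1 so the micro-kernels below can stream them with unit stride; since the 2-column kernels consume two B values per k, the loop is effectively a contiguous copy of 2*K floats. A C sketch, illustrative only:

/* 2-column B packing: copy k rows of 2 consecutive floats into buffer */
static void copy_b_2col(const float *b, float *buffer, long k)
{
    for (long i = 0; i < 2 * k; i++)
        buffer[i] = b[i];
}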
+ vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
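The K loop that follows at .L2_22 shows the pattern used throughout this kernel: the main loop runs the count rounded down to a multiple of 8 (the andq $-8 above), in blocks of eight KERNEL4x2_SUB expansions, and the remainder (andq $7) is handled one step at a time at .L2_27. Each KERNEL4x2_SUB is a rank-1 update of a 4x2 accumulator block; in C terms, illustrative only:

/* one KERNEL4x2_SUB step: broadcast two B values, FMA into 4x2 accumulators */
static void kernel4x2_step(const float *a, const float *b, float acc[4][2])
{
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 2; c++)
            acc[r][c] += a[r] * b[c];
}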
+.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first 
buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + 
KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO 
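The #else branch being set up here (retained from the 16x4 version, like its twins in the other block headers) covers the TRMM case: AO and BO are advanced past the first KK packed iterations before the K loop starts, which for this 4x1 block means kk*4 values of A and kk*1 values of B. A C sketch of the skipped offsets, illustrative names only:

/* skip kk packed iterations: mr values of A and nr values of B per iteration */
static void trmm_skip(float **ao, float **bo, long kk, int mr, int nr)
{
    *ao += kk * (long)mr;
    *bo += kk * (long)nr;
}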
+ leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + 
movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + /************************************************************************************* * TRMM Kernel *************************************************************************************/ @@ -3152,6 +6802,5 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. EPILOGUE - - +#endif diff --git a/param.h b/param.h index 863e83c32..c545d21a8 100644 --- a/param.h +++ b/param.h @@ -1237,10 +1237,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 384 #ifdef WINDOWS_ABI +#define SGEMM_DEFAULT_Q 320 #define DGEMM_DEFAULT_Q 128 #else +#define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif #define CGEMM_DEFAULT_Q 192 From 46bc4fd50cb0581daa31fce012708680ead4818b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 29 Jul 2014 08:53:09 +0200 Subject: [PATCH 25/74] optimized cgemm kernel for haswell --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 7212 +++++++++++++++------- 1 file changed, 4927 insertions(+), 2285 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index baee3cd2f..98f40054e 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -1,2285 +1,4927 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************/ - -/********************************************************************* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* CGEMM_DEFAULT_UNROLL_N 2 -* CGEMM_DEFAULT_UNROLL_M 8 -* CGEMM_DEFAULT_P 384 -* CGEMM_DEFAULT_Q 192 -* A_PR1 512 -* B_PR1 512 -* -* Performance at 6912x6912x6912: -* 1 thread: 84 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) -* 2 threads: 153 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) -* 3 threads: 224 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) -* 4 threads: 278 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) -* -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(BULLDOZER) - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 - -#else - -#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 - -#endif - -#else - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPS_R( y0,y1,y2 ) 
vfmadd231ps y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 - -#else - -#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 - -#endif - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/***************************************************************************************************************************/ - -.macro KERNEL8x2_SUB - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) - VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm10, %ymm11,%ymm11 - vaddsubps %ymm12, %ymm13,%ymm13 - vaddsubps %ymm14, %ymm15,%ymm15 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm11, %ymm10 - vmovaps %ymm13, %ymm12 - vmovaps %ymm15, %ymm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm10, %ymm0, %ymm10 - vmulps %ymm12, %ymm0, %ymm12 - vmulps %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm11, %ymm1, %ymm11 - vmulps %ymm13, %ymm1, %ymm13 - vmulps %ymm15, %ymm1, %ymm15 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - - vaddps (CO1, LDC), %ymm10, %ymm10 - vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 8 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, 
BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, 
%xmm10 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - vmovsd %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) - VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm12, %ymm13,%ymm13 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm13, %ymm12 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm12, %ymm0, 
%ymm12 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm13, %ymm1, %ymm13 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap 
high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L2_4_10 - - ALIGN_4 -/**********************************************************************************************************/ - -.L2_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - 
movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - - je .L2_8_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - - je .L2_8_16 - - jmp .L2_8_12 - ALIGN_4 - -.L2_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_17: - - KERNEL8x2_SUB - - jl .L2_8_17 - ALIGN_4 - - -.L2_8_19: - - SAVE8x2 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_8_11 - ALIGN_4 - - -/**********************************************************************************************************/ - - - - -.L2_4_10: - testq $ 7, M - jz .L2_4_60 // to next 2 lines of N - - testq $ 4, M - jz .L2_4_20 - ALIGN_4 - - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - 
addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_4_20: - - testq $ 2, M - jz .L2_4_40 - ALIGN_4 - -.L2_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 
A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - jmp .L2_4_22 - ALIGN_4 - -.L2_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_27: - - KERNEL2x2_SUB - - jl .L2_4_27 - ALIGN_4 - - -.L2_4_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_4_21 - ALIGN_4 - - - -/**************************************************************************/ -.L2_4_40: - testq $ 1, M - jz .L2_4_60 // to next 2 lines of N - - ALIGN_4 - -.L2_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( 
K % 8 ) - je .L2_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - jmp .L2_4_42 - ALIGN_4 - -.L2_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_47: - - KERNEL1x2_SUB - - jl .L2_4_47 - ALIGN_4 - - -.L2_4_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_4_41 - ALIGN_4 - - - - -.L2_4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L1_4_10 - - ALIGN_4 - -/**************************************************************************************************/ - -.L1_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, 
%rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - jmp .L1_8_12 - ALIGN_4 - -.L1_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_17: - - KERNEL8x1_SUB - - jl .L1_8_17 - ALIGN_4 - - -.L1_8_19: - - SAVE8x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_8_11 - ALIGN_4 - - - -/**************************************************************************************************/ -.L1_4_10: - - testq $ 7, M - jz .L999 - - testq $ 4, M - jz .L1_4_20 - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - 
leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_4_20: - - testq $ 2, M - jz .L1_4_40 - ALIGN_4 - -.L1_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - 
jmp .L1_4_22 - ALIGN_4 - -.L1_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_27: - - KERNEL2x1_SUB - - jl .L1_4_27 - ALIGN_4 - - -.L1_4_29: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_4_40: - testq $ 1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - jmp .L1_4_42 - ALIGN_4 - -.L1_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_47: - - KERNEL1x1_SUB - - jl .L1_4_47 - ALIGN_4 - - -.L1_4_49: - - SAVE1x1 - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && 
defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/********************************************************************* +* 2014/07/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* CGEMM_DEFAULT_UNROLL_N 2 +* CGEMM_DEFAULT_UNROLL_M 8 +* CGEMM_DEFAULT_P 384 +* CGEMM_DEFAULT_Q 192 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/29 Saar +* Performance at 6912x6912x6912: +* 1 thread: 107 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) +* 2 threads: 208 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) +* 3 threads: 289 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) +* 4 threads: 377 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) +* +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 + +#else + +#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 + +#else + +#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 + 
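The four macro variants selected by the surrounding #if chain differ only in where vfnmadd231ps replaces vfmadd231ps; roughly, each conjugated operand flips the sign fed into one of the two accumulators. A sketch of the sign pattern, read off the definitions above (illustrative pseudo-code, not from the patch):

    /* acc_r accumulates a*b_re, acc_i accumulates a*b_im, per element:
         NN/NT/TN/TT : acc_r += a*b_re;  acc_i += a*b_im;
         RN/RT/CN/CT : acc_r -= a*b_re;  acc_i += a*b_im;
         NR/NC/TR/TC : acc_r += a*b_re;  acc_i -= a*b_im;
         otherwise   : acc_r -= a*b_re;  acc_i -= a*b_im;            */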
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 + +#endif + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + + + +/***************************************************************************************************************************/ + +.macro KERNEL8x3_SUB + + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(AO), %ymm1 + vbroadcastss -8 * SIZE(BO), %ymm2 + vbroadcastss -7 * SIZE(BO), %ymm3 + prefetcht0 A_PR1(AO) + + VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPS_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastss -6 * SIZE(BO), %ymm2 + vbroadcastss -5 * SIZE(BO), %ymm3 + VFMADDPS_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPS_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPS_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPS_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPS_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 ) + + + addq $6*SIZE, BO + addq $16*SIZE, AO + decq %rax +.endm + +.macro SAVE8x3 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 + vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + vaddsubps %ymm5, %ymm4 , %ymm4 + vaddsubps %ymm7, %ymm6 , %ymm6 + + vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9 + vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 + vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5 + vshufps $ 0xb1, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + vaddsubps %ymm4, %ymm5 ,%ymm5 + vaddsubps %ymm6, %ymm7 ,%ymm7 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + vmovaps %ymm5, %ymm4 + vmovaps %ymm7, %ymm6 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 + vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + vmulps %ymm4 , %ymm0, %ymm4 + vmulps %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + vmulps %ymm5 , %ymm1, %ymm5 + vmulps %ymm7 , %ymm1, %ymm7 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + vaddsubps %ymm5, %ymm4 , %ymm4 + vaddsubps %ymm7, %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddps (CO1, LDC,2), %ymm4, %ymm4 + vaddps 8 * SIZE(CO1, LDC,2), %ymm6, %ymm6 + +#endif + + vmovups 
%ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + + vmovups %ymm4 , (CO1, LDC,2) + vmovups %ymm6 , 8 * SIZE(CO1, LDC,2) + +.endm + + +/***************************************************************************************************************************/ + +.macro KERNEL4x3_SUB + + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -8 * SIZE(BO), %ymm2 + vbroadcastss -7 * SIZE(BO), %ymm3 + + VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 ) + + vbroadcastss -6 * SIZE(BO), %ymm2 + vbroadcastss -5 * SIZE(BO), %ymm3 + VFMADDPS_R( %ymm12,%ymm2,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm3,%ymm0 ) + + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) + + addq $6*SIZE, BO + addq $8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm5, %ymm4 , %ymm4 + + vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm4, %ymm5 ,%ymm5 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + vmovaps %ymm5, %ymm4 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm4 , %ymm0, %ymm4 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm5 , %ymm1, %ymm5 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm5, %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm8 , %ymm8 + vaddps (CO1, LDC), %ymm12, %ymm12 + vaddps (CO1, LDC,2), %ymm4, %ymm4 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , (CO1, LDC) + vmovups %ymm4 , (CO1, LDC,2) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL2x3_SUB + + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -8 * SIZE(BO), %xmm2 + vbroadcastss -7 * SIZE(BO), %xmm3 + + VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 ) + + vbroadcastss -6 * SIZE(BO), %xmm2 + vbroadcastss -5 * SIZE(BO), %xmm3 + VFMADDPS_R( %xmm12,%xmm2,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm3,%xmm0 ) + + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $6*SIZE, BO + addq $4*SIZE, AO + decq %rax + +.endm + +.macro SAVE2x3 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm5, %xmm4 , %xmm4 + + vshufps $ 
0xb1, %xmm8 , %xmm8 , %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm4, %xmm5 ,%xmm5 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + vmovaps %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm5 , %xmm1, %xmm5 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm5, %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm8 , %xmm8 + vaddps (CO1, LDC), %xmm12, %xmm12 + vaddps (CO1, LDC,2), %xmm4, %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + +/***************************************************************************************************************************/ + +.macro KERNEL1x3_SUB + + vmovsd -16 * SIZE(AO), %xmm0 + vbroadcastss -8 * SIZE(BO), %xmm2 + vbroadcastss -7 * SIZE(BO), %xmm3 + + VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 ) + + vbroadcastss -6 * SIZE(BO), %xmm2 + vbroadcastss -5 * SIZE(BO), %xmm3 + VFMADDPS_R( %xmm12,%xmm2,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm3,%xmm0 ) + + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $6*SIZE, BO + addq $2*SIZE, AO + decq %rax + +.endm + +.macro SAVE1x3 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm5, %xmm4 , %xmm4 + + vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm4, %xmm5 ,%xmm5 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + vmovaps %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm5 , %xmm1, %xmm5 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm5, %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vmovsd (CO1) , %xmm9 + vmovsd (CO1,LDC) , %xmm13 + vmovsd (CO1,LDC,2), %xmm5 + vaddps %xmm9 , %xmm8 , %xmm8 + vaddps %xmm13, %xmm12, %xmm12 + vaddps %xmm5 , %xmm4, %xmm4 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm12 , (CO1, LDC) + vmovsd %xmm4 , (CO1, LDC,2) + +.endm + + +/***************************************************************************************************************************/ + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + 
VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || 
defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, 
SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) + VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 8, %rax 
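+
+/* The x2/x1 kernels address A and B through a negative running index: the
+   driver loops first advance AO/BO past the packed panels, negate %rax and
+   BI, and each KERNEL*_SUB then bumps them by the number of float values it
+   consumed (here 8 floats for four complex A elements and 2 floats for one
+   complex B element).  The final addq sets the zero flag when the panel is
+   exhausted, which is what the je / jl branches in the drivers test.
+   Roughly, in C-like pseudocode (illustrative names, not part of the file):
+
+       float *a_end = AO, *b_end = BO;          // already past the panel
+       long ai = -(k * 8), bi = -(k * 2);       // offsets in float values
+       do {
+           kernel4x1_sub(a_end + ai, b_end + bi, acc);
+           ai += 8; bi += 2;
+       } while (ai < 0);                        // the "jl" tail loop
+*/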
+.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + +.endm + + +#if !defined(TRMMKERNEL) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 
8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + +/************************************************************************************************/ + +.L6_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02b + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L6_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L6_8_11: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_8_16 + + ALIGN_4 + +.L6_8_12: + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + je .L6_8_16 + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + je .L6_8_16 + + jmp .L6_8_12 + ALIGN_4 + +.L6_8_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_8_19 + + ALIGN_4 + +.L6_8_17: + + KERNEL8x3_SUB + + jnz .L6_8_17 + ALIGN_4 + + +.L6_8_19: + + SAVE8x3 + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + +.L6_4_10: + testq $ 7, M + jz .L6_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L6_4_20 + ALIGN_4 + + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + + ALIGN_4 + +.L6_4_12: + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + 
prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L6_4_20: + + testq $ 2, M + jz .L6_4_40 + ALIGN_4 + +.L6_4_21: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_26 + + ALIGN_4 + +.L6_4_22: + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_4_26 + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_4_26 + + jmp .L6_4_22 + ALIGN_4 + +.L6_4_26: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_29 + + ALIGN_4 + +.L6_4_27: + + KERNEL2x3_SUB + + jnz .L6_4_27 + ALIGN_4 + + +.L6_4_29: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L6_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L6_4_40: + testq $ 1, M + jz .L6_4_60 // to next 2 lines of N + + ALIGN_4 + +.L6_4_41: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_46 + + ALIGN_4 + +.L6_4_42: + + prefetcht0 A_PR1(AO) + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_4_46 + + prefetcht0 A_PR1(AO) + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_4_46 + + jmp .L6_4_42 + ALIGN_4 + +.L6_4_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_49 + ALIGN_4 + +.L6_4_47: + + KERNEL1x3_SUB + + jnz .L6_4_47 + ALIGN_4 + + +.L6_4_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_4_41 + ALIGN_4 + + + + +.L6_4_60: + + +/*******************************************************************************************/ + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + +.L7_02b: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02b + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L7_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L7_8_11: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_8_16 + + ALIGN_4 + +.L7_8_12: + + 
KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + je .L7_8_16 + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + je .L7_8_16 + + jmp .L7_8_12 + ALIGN_4 + +.L7_8_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_8_19 + + ALIGN_4 + +.L7_8_17: + + KERNEL8x3_SUB + + jnz .L7_8_17 + ALIGN_4 + + +.L7_8_19: + + SAVE8x3 + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + +.L7_4_10: + testq $ 7, M + jz .L7_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L7_4_20 + ALIGN_4 + + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + + ALIGN_4 + +.L7_4_12: + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L7_4_20: + + testq $ 2, M + jz .L7_4_40 + ALIGN_4 + +.L7_4_21: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_26 + + ALIGN_4 + +.L7_4_22: + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_4_26 + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_4_26 + + jmp .L7_4_22 + ALIGN_4 + +.L7_4_26: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_29 + + ALIGN_4 + +.L7_4_27: + + KERNEL2x3_SUB + + jnz .L7_4_27 + ALIGN_4 + + +.L7_4_29: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L7_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L7_4_40: + testq $ 1, M + jz .L7_4_60 // to next 2 lines of N + + ALIGN_4 + +.L7_4_41: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_46 + + ALIGN_4 + +.L7_4_42: + + prefetcht0 A_PR1(AO) + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_4_46 + + prefetcht0 A_PR1(AO) + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_4_46 + + jmp .L7_4_42 + ALIGN_4 + +.L7_4_46: + movq K, %rax + + andq $ 7, 
%rax # if (k & 1) + je .L7_4_49 + ALIGN_4 + +.L7_4_47: + + KERNEL1x3_SUB + + jnz .L7_4_47 + ALIGN_4 + + +.L7_4_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_4_41 + ALIGN_4 + + + + +.L7_4_60: + + decq J // j -- + jg .L6_01 // next 6 lines of N + + + +/************************************************************************************************/ + +.L2_00: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values 
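+
+/* Offsets here are counted in float values: for the MR=8, NR=2 micro-tile
+   each k step consumes 8 complex (16 float) values of A and 2 complex
+   (4 float) values of B, hence BI = k*4 and %rax = k*16 before the panel
+   pointers are advanced and the indices negated for the count-up loop.
+   Illustrative formulas only (not part of the kernel):
+
+       a_floats = k * 8 * 2;    // 8 rows of A, 2 floats per complex value
+       b_floats = k * 2 * 2;    // 2 columns of B, 2 floats per complex value
+*/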
+ leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $ 7, M + jz .L2_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of 
values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $ 2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + +.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), 
%xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $ 1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 
lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $ 7, M + jz .L999 + + testq $ 4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ 
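+
+/* The remainder handling below mirrors the other column blocks: M is first
+   walked in full 8-row tiles (sarq $3, I), then the leftover rows are
+   covered by the 4-, 2- and 1-row kernels selected with testq $4/$2/$1, M.
+   An illustrative outline of one column block in C (names illustrative,
+   not part of this file):
+
+       for (i = 0; i < (m >> 3); i++) do_8xN();   // full 8-row tiles
+       if (m & 4) do_4xN();
+       if (m & 2) do_2xN();
+       if (m & 1) do_1xN();
+*/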
+ +.L1_4_20: + + testq $ 2, M + jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: + testq $ 1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + 
addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + +#else + +/************************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, 
OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO 
+ leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $ 7, M + jz .L2_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), 
AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $ 2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + +.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, 
LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $ 1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + 
+/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $ 7, M + jz .L999 + + testq $ 4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_4_20: + + testq 
$ 2, M + jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: + testq $ 1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number 
of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + From 5087096711ffe2869750afce96028b29b2489428 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 29 Jul 2014 19:07:21 +0200 Subject: [PATCH 26/74] optimization of sandybridge cgemm-kernel --- kernel/x86_64/cgemm_kernel_8x2_sandy.S | 174 +++++++++++++++++-------- param.h | 8 +- 2 files changed, 127 insertions(+), 55 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_sandy.S b/kernel/x86_64/cgemm_kernel_8x2_sandy.S index 564b73380..c85646d43 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_sandy.S +++ b/kernel/x86_64/cgemm_kernel_8x2_sandy.S @@ -25,6 +25,32 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ +/********************************************************************* +* 2014/07/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* CGEMM_DEFAULT_UNROLL_N 2 +* CGEMM_DEFAULT_UNROLL_M 8 +* CGEMM_DEFAULT_P 768 +* CGEMM_DEFAULT_Q 512 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/29 Saar +* Performance at 6192x6192x6192: +* 1 thread: 49 GFLOPS (MKL: 52) +* 2 threads: 99 GFLOPS (MKL: 102) +* 3 threads: 148 GFLOPS (MKL: 150) +* 4 threads: 195 GFLOPS (MKL: 194) +* 8 threads: 354 GFLOPS (MKL: 317) +* +* +*********************************************************************/ + #define ASSEMBLER #include "common.h" @@ -192,22 +218,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /***************************************************************************************************************************/ -.macro KERNEL8x2_SUB +.macro KERNEL8x2_1 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + prefetcht0 A_PR1(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+64(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+128(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+192(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + addq $ 16, BI + VFMADDPS_YR( 
%ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 64, %rax +.endm + + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + addq $ 4 , BI addq $ 16, %rax .endm @@ -984,47 +1096,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_8_12: - prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB + KERNEL8x2_1 - prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB + KERNEL8x2_1 je .L2_8_16 - prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB + KERNEL8x2_1 - prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB + KERNEL8x2_1 je .L2_8_16 @@ -1152,7 +1236,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) @@ -1160,7 +1243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) @@ -1170,7 +1252,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) @@ -1178,7 +1259,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) @@ -1305,14 +1385,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_4_22: prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB @@ -1321,14 +1399,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_4_26 prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB @@ -1507,13 +1583,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L2_4_42: prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1522,13 +1596,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_4_46 prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB diff --git a/param.h b/param.h index c545d21a8..82f4ad842 100644 --- a/param.h +++ b/param.h @@ -1134,9 +1134,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r -#define CGEMM_DEFAULT_P 384 -//#define CGEMM_DEFAULT_R cgemm_r -#define CGEMM_DEFAULT_R 1024 +#define CGEMM_DEFAULT_P 768 +#define CGEMM_DEFAULT_R cgemm_r +//#define CGEMM_DEFAULT_R 1024 #define ZGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_R zgemm_r @@ -1148,7 +1148,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 -#define CGEMM_DEFAULT_Q 192 +#define CGEMM_DEFAULT_Q 512 #define ZGEMM_DEFAULT_Q 192 #define XGEMM_DEFAULT_Q 128 From ca63503e61d2e20d4489b755517e238763b51b9c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 30 Jul 2014 13:03:42 +0200 Subject: [PATCH 27/74] extented plot-filter.sh for linpack and cholesky benchmarks --- benchmark/plot-filter.sh | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/benchmark/plot-filter.sh b/benchmark/plot-filter.sh index b47535b6f..420ec9b02 100755 --- a/benchmark/plot-filter.sh +++ b/benchmark/plot-filter.sh @@ -33,6 +33,28 @@ # ./dgemm.goto 2>&1|./plotfilter.sh >OpenBLAS # ************************************************************************ +if [ $# -eq 1 ] +then + arg1=$1 +else + arg1=0 +fi -awk '/MFlops/ { print $1,int($3) }'|tail --lines=+2 +case $arg1 in + +L) + # Linpack Benchmark + awk '/MFlops/ { print $1,int($8) }'|tail --lines=+2 + ;; + +C) + # Cholesky Benchmark + awk '/MFlops/ { print $3,int($9) }'|tail --lines=+2 + ;; + + +*) + awk '/MFlops/ { print $1,int($3) }'|tail --lines=+2 + ;; +esac From 799a0eabbd85876d0d94d344c1d5207a5d3aa641 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 30 Jul 2014 14:00:19 +0200 Subject: [PATCH 28/74] bugfix in cholesky.c --- benchmark/cholesky.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index 1ae3748bb..76c368eda 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -119,7 +119,11 @@ static __inline double getmflops(int ratio, int m, double secs){ int MAIN__(int argc, char *argv[]){ +#ifndef COMPLEX char *trans[] = {"T", "N"}; +#else + char *trans[] = {"C", "N"}; +#endif char *uplo[] = {"U", "L"}; FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; From a183ad1df4e88447473a41bec42d5be8f5f8e746 Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Thu, 31 Jul 2014 04:59:30 +0000 Subject: [PATCH 29/74] cpuid_arm: fix detection when cpuinfo uses "Processor" instead of "model name" --- cpuid_arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_arm.c b/cpuid_arm.c index 809ef3d3a..b7181b2f9 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -90,7 +90,7 @@ int detect(void) while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("model name", buffer, 10)) + if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", 
buffer, 9))) { p = strchr(buffer, ':') + 2; break; From 296564e36907288123179bcd6e6cb057d56097bc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 31 Jul 2014 10:35:25 +0200 Subject: [PATCH 30/74] added lapack geev benchmark --- benchmark/Makefile | 72 +++++++++++++ benchmark/geev.c | 260 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 benchmark/geev.c diff --git a/benchmark/Makefile b/benchmark/Makefile index fefd99026..c82e067fa 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -40,6 +40,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -55,6 +56,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ @@ -71,6 +73,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ @@ -86,6 +89,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl all :: goto atlas acml mkl @@ -720,6 +724,61 @@ dsymv.atlas : dsymv.$(SUFFIX) dsymv.mkl : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sgeev #################################################### +sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sgeev.acml : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.atlas : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.mkl : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgeev #################################################### +dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dgeev.acml : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.atlas : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.mkl : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgeev #################################################### + +cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cgeev.acml : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.atlas : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.mkl : cgeev.$(SUFFIX) + 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgeev #################################################### + +zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zgeev.acml : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.atlas : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.mkl : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### @@ -861,6 +920,19 @@ ssymv.$(SUFFIX) : symv.c dsymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +sgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + clean :: @rm -f *.goto *.mkl *.acml *.atlas diff --git a/benchmark/geev.c b/benchmark/geev.c new file mode 100644 index 000000000..004e5b98d --- /dev/null +++ b/benchmark/geev.c @@ -0,0 +1,260 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef GEEV + +#ifndef COMPLEX +#ifdef XDOUBLE +#define GEEV BLASFUNC(qgeev) +#elif defined(DOUBLE) +#define GEEV BLASFUNC(dgeev) +#else +#define GEEV BLASFUNC(sgeev) +#endif +#else +#ifdef XDOUBLE +#define GEEV BLASFUNC(xgeev) +#elif defined(DOUBLE) +#define GEEV BLASFUNC(zgeev) +#else +#define GEEV BLASFUNC(cgeev) +#endif +#endif + +#ifndef COMPLEX +extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, + blasint* lda, FLOAT* wr, FLOAT* wi, FLOAT* vl, blasint* ldvl, + FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, blasint* info ); +#else +extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, + blasint* lda, FLOAT* wr, FLOAT* vl, blasint* ldvl, + FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info ); +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; + FLOAT wkopt; + char job[2]="V"; + char *p; + + blasint m, i, j, info,lwork; + double factor = 26.33; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_JOB"))) job[0]=*p; + + if ( job[0] == 'N' ) factor = 10.0; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Job=%s\n", from, to, step,job); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( vl = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( vr = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( wr = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( wi = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( rwork = (FLOAT 
*)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + for(j = 0; j < to; j++){ + for(i = 0; i < to * COMPSIZE; i++){ + a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + lwork = -1; + m=to; +#ifndef COMPLEX + GEEV (job, job, &m, a, &m, wr, wi, vl, &m, vr, &m, &wkopt, &lwork, &info); +#else + GEEV (job, job, &m, a, &m, wr, vl, &m, vr, &m, &wkopt, &lwork,rwork, &info); +#endif + + lwork = (blasint)wkopt; + if (( work = (FLOAT *)malloc(sizeof(FLOAT) * lwork * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE FLops Time Lwork\n"); + + for(m = from; m <= to; m += step){ + + fprintf(stderr, " %6d : ", (int)m); + gettimeofday( &start, (struct timezone *)0); + + lwork = -1; +#ifndef COMPLEX + GEEV (job, job, &m, a, &m, wr, wi, vl, &m, vr, &m, &wkopt, &lwork, &info); +#else + GEEV (job, job, &m, a, &m, wr, vl, &m, vr, &m, &wkopt, &lwork,rwork, &info); +#endif + + lwork = (blasint)wkopt; +#ifndef COMPLEX + GEEV (job, job, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info); +#else + GEEV (job, job, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); +#endif + + gettimeofday( &stop, (struct timezone *)0); + + if (info) { + fprintf(stderr, "failed to compute eigenvalues .. %d\n", info); + exit(1); + } + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + fprintf(stderr, + " %10.2f MFlops : %10.2f Sec : %d\n", + COMPSIZE * COMPSIZE * factor * (double)m * (double)m * (double)m / time1 * 1.e-6,time1,lwork); + + + } + + return 0; +} + +void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From 7c611a2f9561198ae13a5f4c4bffc5c3b9c68331 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 31 Jul 2014 12:35:38 +0200 Subject: [PATCH 31/74] bugfix for zgeev --- benchmark/geev.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmark/geev.c b/benchmark/geev.c index 004e5b98d..3b7465360 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -142,8 +142,8 @@ static void *huge_malloc(BLASLONG size){ int MAIN__(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; - FLOAT wkopt; - char job[2]="V"; + FLOAT wkopt[4]; + char job='V'; char *p; blasint m, i, j, info,lwork; @@ -162,11 +162,11 @@ int MAIN__(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} - if ((p = getenv("OPENBLAS_JOB"))) job[0]=*p; + if ((p = getenv("OPENBLAS_JOB"))) job=*p; - if ( job[0] == 'N' ) factor = 10.0; + if ( job == 'N' ) factor = 10.0; - fprintf(stderr, "From : %3d To : %3d Step = %3d Job=%s\n", from, to, step,job); + fprintf(stderr, "From : %3d To : %3d Step = %3d Job=%c\n", from, to, step,job); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); @@ -202,12 +202,12 @@ int MAIN__(int argc, char *argv[]){ lwork = -1; m=to; #ifndef COMPLEX - GEEV (job, job, &m, a, &m, wr, wi, vl, &m, vr, &m, &wkopt, &lwork, &info); + GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info); #else - GEEV (job, job, &m, a, &m, wr, vl, &m, vr, &m, &wkopt, &lwork,rwork, &info); + GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info); #endif - lwork = (blasint)wkopt; + lwork = (blasint)wkopt[0]; if (( work = (FLOAT *)malloc(sizeof(FLOAT) * lwork * 
COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } @@ -226,16 +226,16 @@ int MAIN__(int argc, char *argv[]){ lwork = -1; #ifndef COMPLEX - GEEV (job, job, &m, a, &m, wr, wi, vl, &m, vr, &m, &wkopt, &lwork, &info); + GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info); #else - GEEV (job, job, &m, a, &m, wr, vl, &m, vr, &m, &wkopt, &lwork,rwork, &info); + GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info); #endif - lwork = (blasint)wkopt; + lwork = (blasint)wkopt[0]; #ifndef COMPLEX - GEEV (job, job, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info); + GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info); #else - GEEV (job, job, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); + GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); #endif gettimeofday( &stop, (struct timezone *)0); From 651dd22d7d7935f57bf3d46fccdb2b63a8d33173 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 1 Aug 2014 08:55:20 +0200 Subject: [PATCH 32/74] added benchmark program for lapack ?getri functions --- benchmark/Makefile | 72 ++++++++++++++ benchmark/getri.c | 234 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 306 insertions(+) create mode 100644 benchmark/getri.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c82e067fa..3e605f2cb 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -41,6 +41,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -57,6 +58,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ @@ -74,6 +76,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ @@ -90,6 +93,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl all :: goto atlas acml mkl @@ -779,6 +783,61 @@ zgeev.mkl : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sgetri #################################################### +sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sgetri.acml : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.atlas : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.mkl : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+##################################### Dgetri #################################################### +dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dgetri.acml : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.atlas : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.mkl : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgetri #################################################### + +cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cgetri.acml : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.atlas : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.mkl : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgetri #################################################### + +zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zgetri.acml : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.atlas : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.mkl : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### @@ -932,6 +991,19 @@ cgeev.$(SUFFIX) : geev.c zgeev.$(SUFFIX) : geev.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + clean :: diff --git a/benchmark/getri.c b/benchmark/getri.c new file mode 100644 index 000000000..897f1ff04 --- /dev/null +++ b/benchmark/getri.c @@ -0,0 +1,234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef GETRF +#undef GETRI + +#ifndef COMPLEX +#ifdef XDOUBLE +#define GETRF BLASFUNC(qgetrf) +#define GETRI BLASFUNC(qgetri) +#elif defined(DOUBLE) +#define GETRF BLASFUNC(dgetrf) +#define GETRI BLASFUNC(dgetri) +#else +#define GETRF BLASFUNC(sgetrf) +#define GETRI BLASFUNC(sgetri) +#endif +#else +#ifdef XDOUBLE +#define GETRF BLASFUNC(xgetrf) +#define GETRI BLASFUNC(xgetri) +#elif defined(DOUBLE) +#define GETRF BLASFUNC(zgetrf) +#define GETRI BLASFUNC(zgetri) +#else +#define GETRF BLASFUNC(cgetrf) +#define GETRI BLASFUNC(cgetri) +#endif +#endif + +extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info); + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a,*work; + FLOAT wkopt[4]; + blasint *ipiv; + blasint m, i, j, info,lwork; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + + fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + 
fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + + + for(j = 0; j < to; j++){ + for(i = 0; i < to * COMPSIZE; i++){ + a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + lwork = -1; + m=to; + + GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); + + lwork = (blasint)wkopt[0]; + if (( work = (FLOAT *)malloc(sizeof(FLOAT) * lwork * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE FLops Time Lwork\n"); + + for(m = from; m <= to; m += step){ + + fprintf(stderr, " %6d : ", (int)m); + + GETRF (&m, &m, a, &m, ipiv, &info); + + if (info) { + fprintf(stderr, "Matrix is not singular .. %d\n", info); + exit(1); + } + + gettimeofday( &start, (struct timezone *)0); + + lwork = -1; + GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); + + lwork = (blasint)wkopt[0]; + GETRI(&m, a, &m, ipiv, work, &lwork, &info); + gettimeofday( &stop, (struct timezone *)0); + + if (info) { + fprintf(stderr, "failed compute inverse matrix .. %d\n", info); + exit(1); + } + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + fprintf(stderr, + " %10.2f MFlops : %10.2f Sec : %d\n", + COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork); + + + } + + return 0; +} + +void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From f5f50b3563e3f1e5c8c31d96bbc6c8ebd06b516d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 1 Aug 2014 21:08:37 +0200 Subject: [PATCH 33/74] added benchmarks for lapack potrf, potrs and potri functions --- benchmark/Makefile | 71 ++++++++++++ benchmark/potrf.c | 282 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 353 insertions(+) create mode 100644 benchmark/potrf.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 3e605f2cb..de94dcc59 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -42,6 +42,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -59,6 +60,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ @@ -77,6 +79,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ @@ -94,6 +97,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl 
dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl all :: goto atlas acml mkl @@ -838,6 +842,60 @@ zgetri.mkl : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Spotrf #################################################### +spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +spotrf.acml : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.atlas : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.mkl : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dpotrf #################################################### +dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dpotrf.acml : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.atlas : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.mkl : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cpotrf #################################################### + +cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cpotrf.acml : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.atlas : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.mkl : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zpotrf #################################################### + +zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zpotrf.acml : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.atlas : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.mkl : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ################################################################################################### @@ -1003,6 +1061,19 @@ cgetri.$(SUFFIX) : getri.c zgetri.$(SUFFIX) : getri.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +spotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + diff --git a/benchmark/potrf.c b/benchmark/potrf.c new file mode 100644 index 000000000..7b6cdd799 --- /dev/null +++ b/benchmark/potrf.c @@ -0,0 +1,282 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +double fabs(double); + +#undef POTRF + +#ifndef COMPLEX +#ifdef XDOUBLE +#define POTRF BLASFUNC(qpotrf) +#define POTRS BLASFUNC(qpotrs) +#define POTRI BLASFUNC(qpotri) +#define SYRK BLASFUNC(qsyrk) +#elif defined(DOUBLE) +#define POTRF BLASFUNC(dpotrf) +#define POTRS BLASFUNC(dpotrs) +#define POTRI BLASFUNC(dpotri) +#define SYRK BLASFUNC(dsyrk) +#else +#define POTRF BLASFUNC(spotrf) +#define POTRS BLASFUNC(spotrs) +#define POTRI BLASFUNC(spotri) +#define SYRK BLASFUNC(ssyrk) +#endif +#else +#ifdef XDOUBLE +#define POTRF BLASFUNC(xpotrf) +#define POTRS BLASFUNC(xpotrs) +#define POTRI BLASFUNC(xpotri) +#define SYRK BLASFUNC(xherk) +#elif defined(DOUBLE) +#define POTRF BLASFUNC(zpotrf) +#define POTRS BLASFUNC(zpotrs) +#define POTRI BLASFUNC(zpotri) +#define SYRK BLASFUNC(zherk) +#else +#define POTRF BLASFUNC(cpotrf) +#define POTRS BLASFUNC(cpotrs) +#define POTRI BLASFUNC(cpotri) +#define SYRK BLASFUNC(cherk) +#endif +#endif + +// extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info); +// extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info); + +#if defined(__WIN32__) || defined(__WIN64__) + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +int MAIN__(int argc, char *argv[]){ + +#ifndef COMPLEX + char *trans[] = {"T", "N"}; +#else + char *trans[] = {"C", "N"}; +#endif + char *uplo[] = {"U", "L"}; + FLOAT alpha[] = {1.0, 
0.0}; + FLOAT beta [] = {0.0, 0.0}; + + FLOAT *a, *b; + + char *p; + char btest = 'F'; + + blasint m, i, j, info, uplos=0; + double flops; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_UPLO"))) + if (*p == 'L') uplos=1; + + if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + for(m = from; m <= to; m += step){ + +#ifndef COMPLEX + if (uplos & 1) { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) a[i + j * m] = 0.; + a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + } else { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[i + j * m] = 0.; + } + } +#else + if (uplos & 1) { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) { + a[(i + j * m) * 2 + 0] = 0.; + a[(i + j * m) * 2 + 1] = 0.; + } + + a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(j + j * m) * 2 + 1] = 0.; + + for(i = j + 1; i < m; i++) { + a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + } + } else { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) { + a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + + a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(j + j * m) * 2 + 1] = 0.; + + for(i = j + 1; i < m; i++) { + a[(i + j * m) * 2 + 0] = 0.; + a[(i + j * m) * 2 + 1] = 0.; + } + } + } +#endif + + SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); + + gettimeofday( &start, (struct timezone *)0); + + POTRF(uplo[uplos], &m, b, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + if (info != 0) { + fprintf(stderr, "Potrf info = %d\n", info); + exit(1); + } + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; + + if ( btest == 'S' ) + { + + for(j = 0; j < to; j++){ + for(i = 0; i < to * COMPSIZE; i++){ + a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + gettimeofday( &start, (struct timezone *)0); + + POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + if (info != 0) { + fprintf(stderr, "Potrs info = %d\n", info); + exit(1); + } + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; + + } + + if ( btest == 'I' ) + { + + gettimeofday( &start, (struct timezone *)0); + + POTRI(uplo[uplos], 
&m, b, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + if (info != 0) { + fprintf(stderr, "Potri info = %d\n", info); + exit(1); + } + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; + } + + fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest); + + + } + + + return 0; +} + +void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); + From 271af406f33d44a455b171f01d23a8006a18d471 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 1 Aug 2014 23:10:08 +0200 Subject: [PATCH 34/74] bugfix for linux affinity code --- common.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/common.h b/common.h index 7125ce3bc..1250e2e61 100644 --- a/common.h +++ b/common.h @@ -524,10 +524,21 @@ static __inline void blas_unlock(volatile BLASULONG *address){ *address = 0; } + +#ifdef OS_WINDOWS static __inline int readenv_atoi(char *env) { env_var_t p; return readenv(p,env) ? 0 : atoi(p); } +#else +static __inline int readenv_atoi(char *env) { + char *p; + if (( p = getenv(env) )) + return (atoi(p)); + else + return(0); +} +#endif #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) From 793175be3aa34c19d7be57ea16b96cdab07e5443 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 2 Aug 2014 13:40:16 +0200 Subject: [PATCH 35/74] added experimental support for big numa machines --- Makefile.rule | 3 +++ Makefile.system | 4 ++++ driver/others/init.c | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/Makefile.rule b/Makefile.rule index 1969761d6..7bbb39e7a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -95,6 +95,9 @@ NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. NO_AFFINITY = 1 +# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus +# BIGNUMA = 1 + # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers # and OS. However, the performance is low. # NO_AVX = 1 diff --git a/Makefile.system b/Makefile.system index 370da5928..ccde8e9ce 100644 --- a/Makefile.system +++ b/Makefile.system @@ -803,6 +803,10 @@ ifeq ($(USE_OPENMP), 1) CCOMMON_OPT += -DUSE_OPENMP endif +ifeq ($(BIGNUMA), 1) +CCOMMON_OPT += -DBIGNUMA +endif + endif ifeq ($(NO_WARMUP), 1) diff --git a/driver/others/init.c b/driver/others/init.c index 30d35e05d..913538555 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -85,8 +85,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
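For context on the init.c hunks that follow: BIGNUMA raises the static limits from 16 NUMA nodes / 256 CPUs to 128 nodes and CPU_SETSIZE CPUs, which changes how many machine words each affinity bitmask occupies (the NCPUBITS / MAX_BITMASK_LEN macros below) and is why the shared segment requested in open_shmem grows from 4096 to 32768 bytes. A small arithmetic sketch of the bitmask sizing, assuming glibc's CPU_SETSIZE of 1024 and a 64-bit unsigned long (illustration only, not part of the patch):

#include <stdio.h>

int main(void)
{
    const size_t ncpubits  = 8 * sizeof(unsigned long);  /* bits per bitmask word     */
    const size_t dflt_cpus = 256;                         /* default MAX_CPUS          */
    const size_t big_cpus  = 1024;                        /* CPU_SETSIZE with BIGNUMA  */

    printf("bits per bitmask word : %zu\n", ncpubits);
    printf("mask words, default   : %zu\n", dflt_cpus / ncpubits);  /* 4 on 64-bit  */
    printf("mask words, BIGNUMA   : %zu\n", big_cpus / ncpubits);   /* 16 on 64-bit */
    return 0;
}

Building with BIGNUMA=1 (the new Makefile.rule option above, picked up by Makefile.system as -DBIGNUMA) is what selects the larger limits.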
#include #include +#if defined(BIGNUMA) +// max number of nodes as defined in numa.h +// max cpus as defined in sched.h +#define MAX_NODES 128 +#define MAX_CPUS CPU_SETSIZE +#else #define MAX_NODES 16 #define MAX_CPUS 256 +#endif + #define NCPUBITS (8*sizeof(unsigned long)) #define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) #define CPUELT(cpu) ((cpu) / NCPUBITS) @@ -544,16 +552,26 @@ static inline int is_dead(int id) { return shmctl(id, IPC_STAT, &ds); } + static void open_shmem(void) { int try = 0; do { +#if defined(BIGNUMA) + // raised to 32768, enough for 128 nodes and 1024 cups + shmid = shmget(SH_MAGIC, 32768, 0666); +#else shmid = shmget(SH_MAGIC, 4096, 0666); +#endif if (shmid == -1) { +#if defined(BIGNUMA) + shmid = shmget(SH_MAGIC, 32768, IPC_CREAT | 0666); +#else shmid = shmget(SH_MAGIC, 4096, IPC_CREAT | 0666); +#endif } try ++; From f7eb81a84676bce20fcf1df3462639d0bc1b28f0 Mon Sep 17 00:00:00 2001 From: Isaac Dunham Date: Sun, 3 Aug 2014 15:06:30 -0700 Subject: [PATCH 36/74] Fix link error on Linux/musl. get_nprocs() is a GNU convenience function equivalent to POSIX2008 sysconf(_SC_NPROCESSORS_ONLN); the latter should be available in unistd.h on any current *nix. (OS X supports this call since 10.5, and FreeBSD currently supports it. But this commit does not change FreeBSD or OS X versions.) --- CONTRIBUTORS.md | 3 +++ driver/others/init.c | 2 +- driver/others/memory.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 58748ea1c..18a218cec 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -114,5 +114,8 @@ In chronological order: * carlkl * [2013-12-13] Fixed LAPACKE building bug on Windows +* Isaac Dunham + * [2014-08-03] Fixed link error on Linux/musl + * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/driver/others/init.c b/driver/others/init.c index 913538555..50a1a23f7 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -865,7 +865,7 @@ void gotoblas_set_affinity2(int threads) {}; void gotoblas_affinity_reschedule(void) {}; -int get_num_procs(void) { return get_nprocs(); } +int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_ONLN); } int get_num_nodes(void) { return 1; } diff --git a/driver/others/memory.c b/driver/others/memory.c index f44b37b97..9fdb18f69 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -162,7 +162,7 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; - if (!nums) nums = get_nprocs(); + if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN); return nums; } #endif From db6917303f12a64fe11430a3b46aaabdefd36234 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 4 Aug 2014 14:29:01 +0200 Subject: [PATCH 37/74] added a better optimized sgemv_n kernel for bulldozer and piledriver --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/sgemv_n.c | 203 +++++++++++++++++++++ kernel/x86_64/sgemv_n_microk_bulldozer-2.c | 99 ++++++++++ 4 files changed, 304 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/sgemv_n.c create mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 73a9ad2ec..ca7a0b7c4 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -2,7 +2,7 @@ ifdef OS_WINDOWS SGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else -SGEMVNKERNEL = sgemv_n_avx.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t_avx.c endif diff --git 
a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 453e7b762..b9d680c43 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -2,7 +2,7 @@ ifdef OS_WINDOWS SGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else -SGEMVNKERNEL = sgemv_n_avx.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t_avx.c endif diff --git a/kernel/x86_64/sgemv_n.c b/kernel/x86_64/sgemv_n.c new file mode 100644 index 000000000..6f240797d --- /dev/null +++ b/kernel/x86_64/sgemv_n.c @@ -0,0 +1,203 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_n_microk_bulldozer-2.c" +#endif + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_16x4 + +static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + int i; + float *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +static void sgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + int i; + float *a0; + a0 = ap; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +static void zero_y(BLASLONG n, FLOAT *dest) +{ + BLASLONG i; + for ( i=0; i Date: Mon, 4 Aug 2014 16:22:11 +0200 Subject: [PATCH 38/74] modified sgemv_n for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/sgemv_n.c | 2 + kernel/x86_64/sgemv_n_microk_haswell-2.c | 86 ++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_n_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 871a7d490..df4aad92f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -2,7 +2,7 @@ ifdef OS_WINDOWS SGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else -SGEMVNKERNEL = sgemv_n_avx.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t_avx.c endif diff --git a/kernel/x86_64/sgemv_n.c b/kernel/x86_64/sgemv_n.c index 6f240797d..d1f4d5f60 100644 --- a/kernel/x86_64/sgemv_n.c +++ b/kernel/x86_64/sgemv_n.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_n_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "sgemv_n_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_haswell-2.c b/kernel/x86_64/sgemv_n_microk_haswell-2.c new file mode 100644 index 000000000..0bad0ec79 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_haswell-2.c @@ -0,0 +1,86 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) +{ + + long register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 7fa7ea3e1e73a79edc6a9facaa573a5f94193827 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Aug 2014 08:04:47 +0200 Subject: [PATCH 39/74] updated haswell optimized sgmv_n kernel --- kernel/x86_64/sgemv_n_microk_haswell-2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-2.c b/kernel/x86_64/sgemv_n_microk_haswell-2.c index 0bad0ec79..b19db9e1e 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-2.c @@ -35,6 +35,7 @@ static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __asm__ __volatile__ ( + "vzeroupper \n\t" "vbroadcastss (%2), %%ymm12 \n\t" // x0 "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 @@ -64,6 +65,7 @@ static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz .L01LOOP%= \n\t" + "vzeroupper \n\t" : : From a4dde45f879494387f8a6a6015fefc0174b86238 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Aug 2014 08:53:09 +0200 Subject: [PATCH 40/74] optimized sgemv_n kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_n.c | 2 + kernel/x86_64/sgemv_n_microk_sandy-2.c | 97 ++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_n_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 
9d7a49562..7869e37a8 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -2,7 +2,7 @@ ifdef OS_WINDOWS SGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else -SGEMVNKERNEL = sgemv_n_avx.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t_avx.c endif diff --git a/kernel/x86_64/sgemv_n.c b/kernel/x86_64/sgemv_n.c index d1f4d5f60..2f30fd9de 100644 --- a/kernel/x86_64/sgemv_n.c +++ b/kernel/x86_64/sgemv_n.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_bulldozer-2.c" #elif defined(HASWELL) #include "sgemv_n_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_n_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_sandy-2.c b/kernel/x86_64/sgemv_n_microk_sandy-2.c new file mode 100644 index 000000000..dfcb0e17b --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_sandy-2.c @@ -0,0 +1,97 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) +{ + + long register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 793f2d43b0a557163f556f744f81fd9031779919 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Aug 2014 10:50:08 +0200 Subject: [PATCH 41/74] added optimized sgemv_n kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 7 ++ kernel/x86_64/sgemv_n.c | 2 + kernel/x86_64/sgemv_n_microk_nehalem-2.c | 144 +++++++++++++++++++++++ 3 files changed, 153 insertions(+) create mode 100644 kernel/x86_64/sgemv_n_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 2f9c20583..bb1264316 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,3 +1,10 @@ +ifdef OS_WINDOWS +SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c +else +SGEMVNKERNEL = sgemv_n.c +endif + SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S diff --git a/kernel/x86_64/sgemv_n.c b/kernel/x86_64/sgemv_n.c index 2f30fd9de..4961deffb 100644 --- a/kernel/x86_64/sgemv_n.c +++ b/kernel/x86_64/sgemv_n.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-2.c" +#elif defined(NEHALEM) +#include "sgemv_n_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-2.c b/kernel/x86_64/sgemv_n_microk_nehalem-2.c new file mode 100644 index 000000000..8499a01a6 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_nehalem-2.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) +{ + + long register i = 0; + + __asm__ __volatile__ + ( + "movss (%2), %%xmm12 \n\t" // x0 + "movss 4(%2), %%xmm13 \n\t" // x1 + "movss 8(%2), %%xmm14 \n\t" // x2 + "movss 12(%2), %%xmm15 \n\t" // x3 + "shufps $0, %%xmm12, %%xmm12\n\t" + "shufps $0, %%xmm13, %%xmm13\n\t" + "shufps $0, %%xmm14, %%xmm14\n\t" + "shufps $0, %%xmm15, %%xmm15\n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + "movups 32(%3,%0,4), %%xmm6 \n\t" // 4 * y + "movups 48(%3,%0,4), %%xmm7 \n\t" // 4 * y + + "prefetcht0 192(%4,%0,4) \n\t" + + "movups (%4,%0,4), %%xmm8 \n\t" + "movups 16(%4,%0,4), %%xmm9 \n\t" + "movups 32(%4,%0,4), %%xmm10 \n\t" + "movups 48(%4,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "mulps %%xmm12, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "mulps %%xmm12, %%xmm10 \n\t" + "addps %%xmm10, %%xmm6 \n\t" + "mulps %%xmm12, %%xmm11 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + "prefetcht0 192(%5,%0,4) \n\t" + + "movups (%5,%0,4), %%xmm8 \n\t" + "movups 16(%5,%0,4), %%xmm9 \n\t" + "movups 32(%5,%0,4), %%xmm10 \n\t" + "movups 48(%5,%0,4), %%xmm11 \n\t" + "mulps %%xmm13, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "mulps %%xmm13, %%xmm10 \n\t" + "addps %%xmm10, %%xmm6 \n\t" + "mulps %%xmm13, %%xmm11 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + "prefetcht0 192(%6,%0,4) \n\t" + + "movups (%6,%0,4), %%xmm8 \n\t" + "movups 16(%6,%0,4), %%xmm9 \n\t" + "movups 32(%6,%0,4), %%xmm10 \n\t" + "movups 48(%6,%0,4), %%xmm11 \n\t" + "mulps %%xmm14, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "mulps %%xmm14, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm6 \n\t" + "mulps %%xmm14, %%xmm11 
\n\t" + "addps %%xmm11, %%xmm7 \n\t" + + "prefetcht0 192(%7,%0,4) \n\t" + + "movups (%7,%0,4), %%xmm8 \n\t" + "movups 16(%7,%0,4), %%xmm9 \n\t" + "movups 32(%7,%0,4), %%xmm10 \n\t" + "movups 48(%7,%0,4), %%xmm11 \n\t" + "mulps %%xmm15, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "mulps %%xmm15, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "mulps %%xmm15, %%xmm10 \n\t" + "addps %%xmm10, %%xmm6 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + + "movups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "movups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y + "movups %%xmm6, 32(%3,%0,4) \n\t" // 4 * y + "movups %%xmm7, 48(%3,%0,4) \n\t" // 4 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 9175b8bd5f8df59146b8539f9760d7ed42611648 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Aug 2014 13:28:39 +0200 Subject: [PATCH 42/74] changed long to blaslong for windows compatibility --- kernel/x86_64/sgemv_n_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/sgemv_n_microk_haswell-2.c | 6 +++--- kernel/x86_64/sgemv_n_microk_nehalem-2.c | 6 +++--- kernel/x86_64/sgemv_n_microk_sandy-2.c | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-2.c b/kernel/x86_64/sgemv_n_microk_bulldozer-2.c index 7e9ee5cc9..d50fa4268 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-2.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-2.c @@ -26,12 +26,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) { - long register i = 0; + BLASLONG register i = 0; __asm__ __volatile__ ( diff --git a/kernel/x86_64/sgemv_n_microk_haswell-2.c b/kernel/x86_64/sgemv_n_microk_haswell-2.c index b19db9e1e..d3fee67c3 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-2.c @@ -26,12 +26,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) { - long register i = 0; + BLASLONG register i = 0; __asm__ __volatile__ ( diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-2.c b/kernel/x86_64/sgemv_n_microk_nehalem-2.c index 8499a01a6..3cfb82a45 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-2.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-2.c @@ -26,12 +26,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) { - long register i = 0; + BLASLONG register i = 0; __asm__ __volatile__ ( diff --git a/kernel/x86_64/sgemv_n_microk_sandy-2.c b/kernel/x86_64/sgemv_n_microk_sandy-2.c index dfcb0e17b..21eff1c5e 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-2.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-2.c @@ -26,12 +26,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) { - long register i = 0; + BLASLONG register i = 0; __asm__ __volatile__ ( From 2bab92961f5031f8881d5d6635e470b6e7c0a416 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Aug 2014 14:52:54 +0200 Subject: [PATCH 43/74] enabled optimized sgemv_n kernels for windows --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/KERNEL.NEHALEM | 3 ++- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index ca7a0b7c4..21fc94701 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,5 +1,5 @@ ifdef OS_WINDOWS -SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index df4aad92f..3e20fcfc7 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,5 +1,5 @@ ifdef OS_WINDOWS -SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index bb1264316..04efa391a 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,8 +1,9 @@ ifdef OS_WINDOWS -SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c endif diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index b9d680c43..b7565edeb 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,5 +1,5 @@ ifdef OS_WINDOWS -SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 7869e37a8..9dae3a41d 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,5 +1,5 @@ ifdef OS_WINDOWS -SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c From 
c80084a98f2882a7bb959bad26cc66ebd65bef63 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Aug 2014 19:42:56 +0200 Subject: [PATCH 44/74] changed default x86_64 sgemv_n kernel to sgemv_n.c --- kernel/x86_64/KERNEL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index ec21826d7..9d0080ae7 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -373,7 +373,7 @@ endif GEMVDEP = ../l2param.h ifndef SGEMVNKERNEL -SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c endif ifndef SGEMVTKERNEL From 8c05b8105b1dad9b9f1ae16d7e0774ce153b6bdd Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Aug 2014 20:14:29 +0200 Subject: [PATCH 45/74] bugfix in sgemv_n.c --- kernel/x86_64/sgemv_n.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/sgemv_n.c b/kernel/x86_64/sgemv_n.c index 4961deffb..f2de1b76a 100644 --- a/kernel/x86_64/sgemv_n.c +++ b/kernel/x86_64/sgemv_n.c @@ -46,8 +46,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - int i; - float *a0,*a1,*a2,*a3; + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; @@ -66,8 +66,8 @@ static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void sgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - int i; - float *a0; + BLASLONG i; + FLOAT *a0; a0 = ap; for ( i=0; i< n; i+=4 ) @@ -130,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG m2; BLASLONG n2; FLOAT xbuffer[4],*ybuffer; + ybuffer = buffer; n1 = n / 4 ; n2 = n % 4 ; - m1 = m - ( m % 16 ); m2 = (m % NBMAX) - (m % 16) ; From 95a8caa2f340b5936ef3a8106f04df07a07e4d93 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 12:12:17 +0200 Subject: [PATCH 46/74] added optimized sgemv_t kernel --- kernel/x86_64/KERNEL | 2 +- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_t.c | 200 +++++++++++++++++++++ kernel/x86_64/sgemv_t_microk_bulldozer-2.c | 109 +++++++++++ 8 files changed, 315 insertions(+), 6 deletions(-) create mode 100644 kernel/x86_64/sgemv_t.c create mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 9d0080ae7..3508753ee 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -377,7 +377,7 @@ SGEMVNKERNEL = sgemv_n.c endif ifndef SGEMVTKERNEL -SGEMVTKERNEL = ../arm/gemv_t.c +SGEMVTKERNEL = sgemv_t.c endif ifndef DGEMVNKERNEL diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 21fc94701..accdddf0e 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t_avx.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 3e20fcfc7..878a56b04 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t_avx.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 04efa391a..8276150c6 100644 --- 
a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index b7565edeb..7b3c9a7b8 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t_avx.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 9dae3a41d..26706b61d 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t_avx.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c new file mode 100644 index 000000000..89254c256 --- /dev/null +++ b/kernel/x86_64/sgemv_t.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_t_microk_bulldozer-2.c" +#endif + +/* +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_n_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "sgemv_n_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_n_microk_sandy-2.c" +#elif defined(NEHALEM) +#include "sgemv_n_microk_nehalem-2.c" +#endif +*/ + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_16x4 + +static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void sgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + *y = temp; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Wed, 6 Aug 2014 13:42:41 +0200 Subject: [PATCH 47/74] added optimized sgemv_t kernel for haswell --- kernel/x86_64/sgemv_t.c | 2 + kernel/x86_64/sgemv_t_microk_haswell-2.c | 112 +++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 kernel/x86_64/sgemv_t_microk_haswell-2.c diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c index 89254c256..566311a69 100644 --- a/kernel/x86_64/sgemv_t.c +++ b/kernel/x86_64/sgemv_t.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "sgemv_t_microk_haswell-2.c" #endif /* diff --git a/kernel/x86_64/sgemv_t_microk_haswell-2.c b/kernel/x86_64/sgemv_t_microk_haswell-2.c new file mode 100644 index 000000000..cef703483 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_haswell-2.c @@ -0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 7aa43c8928bb1c69373ed426aa565c207a99470f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 14:06:30 +0200 Subject: [PATCH 48/74] enabled optimized sgemv kernels for windows --- kernel/x86_64/KERNEL.HASWELL | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 878a56b04..ff22954c6 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ 
b/kernel/x86_64/KERNEL.HASWELL @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c From ca6c8d06cea5bb2fbe1b544f0da1dabefd256422 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 14:24:36 +0200 Subject: [PATCH 49/74] enabled optimized sgemv kernels for windows --- kernel/x86_64/KERNEL.BULLDOZER | 6 ------ kernel/x86_64/KERNEL.PILEDRIVER | 6 ------ 2 files changed, 12 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index accdddf0e..893f13064 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 7b3c9a7b8..146a8768b 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S From d945a2b06d131995d4184311d397b17a6f1c7afb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 16:21:48 +0200 Subject: [PATCH 50/74] added optimized sgemv_t kernel for nehalem --- kernel/x86_64/sgemv_t.c | 2 + kernel/x86_64/sgemv_t_microk_nehalem-2.c | 159 +++++++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 kernel/x86_64/sgemv_t_microk_nehalem-2.c diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c index 566311a69..47e749e58 100644 --- a/kernel/x86_64/sgemv_t.c +++ b/kernel/x86_64/sgemv_t.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-2.c" #elif defined(HASWELL) #include "sgemv_t_microk_haswell-2.c" +#elif defined(NEHALEM) +#include "sgemv_t_microk_nehalem-2.c" #endif /* diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-2.c b/kernel/x86_64/sgemv_t_microk_nehalem-2.c new file mode 100644 index 000000000..e1f2b81bd --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_nehalem-2.c @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorps %%xmm0 , %%xmm0 \n\t" + "xorps %%xmm1 , %%xmm1 \n\t" + "xorps %%xmm2 , %%xmm2 \n\t" + "xorps %%xmm3 , %%xmm3 \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" + "xorps %%xmm6 , %%xmm6 \n\t" + "xorps %%xmm7 , %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "movups (%4,%0,4), %%xmm8 \n\t" + "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + + "prefetcht0 384(%4,%0,4) \n\t" + + "movups 16(%4,%0,4), %%xmm9 \n\t" + "movups 32(%4,%0,4), %%xmm10 \n\t" + "movups 48(%4,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm0 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm4 \n\t" + "movups (%5,%0,4), %%xmm8 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm0 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm4 \n\t" + + "prefetcht0 384(%5,%0,4) \n\t" + + "movups 16(%5,%0,4), %%xmm9 \n\t" + "movups 32(%5,%0,4), %%xmm10 \n\t" + "movups 48(%5,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm1 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "movups (%6,%0,4), %%xmm8 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm1 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + + "prefetcht0 384(%6,%0,4) \n\t" + + "movups 16(%6,%0,4), %%xmm9 \n\t" + "movups 32(%6,%0,4), %%xmm10 \n\t" + "movups 48(%6,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm2 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm6 \n\t" + "movups (%7,%0,4), %%xmm8 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm2 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm6 \n\t" + + "prefetcht0 384(%7,%0,4) \n\t" + + "movups 16(%7,%0,4), %%xmm9 \n\t" + "movups 32(%7,%0,4), %%xmm10 \n\t" + "movups 48(%7,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm3 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm7 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm3 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "addps %%xmm0, %%xmm4 \n\t" + "addps %%xmm1, %%xmm5 \n\t" + "addps %%xmm2, %%xmm6 \n\t" + "addps %%xmm3, %%xmm7 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + 
"movss %%xmm4, (%3) \n\t" + "movss %%xmm5, 4(%3) \n\t" + "movss %%xmm6, 8(%3) \n\t" + "movss %%xmm7, 12(%3) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 2f8927376f50445f800b995ba6010e7ed571ecba Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 16:58:21 +0200 Subject: [PATCH 51/74] enabled optimized nehalem sgemv_t kernel for windows --- kernel/x86_64/KERNEL.NEHALEM | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 8276150c6..ca9ff252d 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S From c9bad1403ad4ff8b170bab16affcc1de1a6b66ce Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 7 Aug 2014 07:49:33 +0200 Subject: [PATCH 52/74] added optimized sgemv_t kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 6 -- kernel/x86_64/sgemv_t.c | 14 +-- kernel/x86_64/sgemv_t_microk_sandy-2.c | 132 +++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 18 deletions(-) create mode 100644 kernel/x86_64/sgemv_t_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 26706b61d..d4fbca7f2 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c index 47e749e58..adfaa9925 100644 --- a/kernel/x86_64/sgemv_t.c +++ b/kernel/x86_64/sgemv_t.c @@ -32,22 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-2.c" #elif defined(HASWELL) #include "sgemv_t_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_t_microk_sandy-2.c" #elif defined(NEHALEM) #include "sgemv_t_microk_nehalem-2.c" #endif -/* -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_n_microk_bulldozer-2.c" -#elif defined(HASWELL) -#include "sgemv_n_microk_haswell-2.c" -#elif defined(SANDYBRIDGE) -#include "sgemv_n_microk_sandy-2.c" -#elif defined(NEHALEM) -#include "sgemv_n_microk_nehalem-2.c" -#endif -*/ - #define NBMAX 4096 #ifndef HAVE_KERNEL_16x4 diff --git a/kernel/x86_64/sgemv_t_microk_sandy-2.c b/kernel/x86_64/sgemv_t_microk_sandy-2.c new file mode 100644 index 000000000..6a3748238 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_sandy-2.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t" + "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t" + "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t" + "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t" + "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t" + "vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm2, %%ymm6 \n\t" + "vaddps %%ymm7, %%ymm3, %%ymm7 \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, 
%%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 60f17628cc65387f474160462511f91f93d50c3e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 7 Aug 2014 09:18:02 +0200 Subject: [PATCH 53/74] added optimized dgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemv_n.c | 203 +++++++++++++++++++++++ kernel/x86_64/dgemv_n_microk_haswell-2.c | 89 ++++++++++ 3 files changed, 293 insertions(+) create mode 100644 kernel/x86_64/dgemv_n.c create mode 100644 kernel/x86_64/dgemv_n_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ff22954c6..bcff2e224 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,5 +1,6 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c +DGEMVNKERNEL = dgemv_n.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/dgemv_n.c b/kernel/x86_64/dgemv_n.c new file mode 100644 index 000000000..5192ba193 --- /dev/null +++ b/kernel/x86_64/dgemv_n.c @@ -0,0 +1,203 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(HASWELL) +#include "dgemv_n_microk_haswell-2.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_16x4 + +static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +static void zero_y(BLASLONG n, FLOAT *dest) +{ + BLASLONG i; + for ( i=0; i Date: Thu, 7 Aug 2014 10:08:54 +0200 Subject: [PATCH 54/74] added optimized dgemv_t kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemv_t.c | 188 +++++++++++++++++++++++ kernel/x86_64/dgemv_t_microk_haswell-2.c | 107 +++++++++++++ 3 files changed, 296 insertions(+) create mode 100644 kernel/x86_64/dgemv_t.c create mode 100644 kernel/x86_64/dgemv_t_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index bcff2e224..cd280e4b6 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,6 +1,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c +DGEMVTKERNEL = dgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/dgemv_t.c b/kernel/x86_64/dgemv_t.c new file mode 100644 index 000000000..76aacd349 --- /dev/null +++ b/kernel/x86_64/dgemv_t.c @@ -0,0 +1,188 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(HASWELL) +#include "dgemv_t_microk_haswell-2.c" +#endif + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_16x4 + +static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + *y = temp; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Thu, 7 Aug 2014 22:30:20 +0200 Subject: [PATCH 55/74] added zgemv_n c-function --- kernel/x86_64/zgemv_n.c | 302 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 kernel/x86_64/zgemv_n.c diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c new file mode 100644 index 000000000..be5b08dcd --- /dev/null +++ b/kernel/x86_64/zgemv_n.c @@ -0,0 +1,302 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
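The gemv drivers in these new files (dgemv_t.c above, zgemv_n.c below) all follow the same pattern: the matrix is processed in column panels of at most NBMAX rows, x is packed into a contiguous buffer when inc_x != 1, and four columns of A at a time are handed to the 16x4 kernel. A rough stand-alone sketch of that panel loop for the real transposed case, simplified for illustration and not the actual driver code:

void gemv_t_panels(long m, long n, const float *a, long lda,
                   const float *x, float *y)
{
        /* illustration only: assumes y is pre-zeroed and inc_x == 1 */
        const long NB = 4096;                        /* plays the role of NBMAX */
        for (long row = 0; row < m; row += NB) {
                long nb = (m - row < NB) ? (m - row) : NB;
                for (long col = 0; col < n; col++) {
                        const float *ap = a + col * lda + row;   /* column-major panel */
                        float sum = 0.0f;
                        for (long i = 0; i < nb; i++)
                                sum += ap[i] * x[row + i];       /* dot product, as in sgemv_kernel_16x1 */
                        y[col] += sum;                           /* accumulate across panels */
                }
        }
}

Limiting each panel to NBMAX elements keeps the packed x buffer cache-resident while the micro-kernel streams through four columns of A; the real drivers additionally handle strided x via copy_x, and the non-transposed drivers zero and re-accumulate y through zero_y/add_y (see zgemv_n.c below).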
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#define NBMAX 1024 + +#ifndef HAVE_KERNEL_16x4 + +static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if !defined(CONJ) +#if !defined(XCONJ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; +#endif +#else +#if !defined(XCONJ) + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] -= a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] -= a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] -= a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] -= a3[i]*x[7] - a3[i+1] * x[6]; + +#else + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] -= a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] -= a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] -= a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] -= a3[i]*x[7] + a3[i+1] * x[6]; + +#endif +#endif + } +} + +#endif + +static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< 2*n; i+=2 ) + { +#if !defined(CONJ) +#if !defined(XCONJ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif +#else +#if !defined(XCONJ) + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] -= a0[i]*x[1] - a0[i+1] * x[0]; + +#else + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] -= a0[i]*x[1] + a0[i+1] * x[0]; +#endif +#endif + + } +} + + +static void zero_y(BLASLONG n, FLOAT *dest) +{ + BLASLONG i; + for ( i=0; i<2*n; i++ ) + { + *dest = 0.0; + dest++; + } +} + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + for ( i=0; i Date: Sun, 10 Aug 2014 08:39:17 +0200 Subject: [PATCH 56/74] added optimized zgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemv_n_microk_haswell-2.c | 2 +- kernel/x86_64/zgemv_n.c | 4 + kernel/x86_64/zgemv_n_microk_haswell-2.c | 149 +++++++++++++++++++++++ 4 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/zgemv_n_microk_haswell-2.c diff --git 
a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index cd280e4b6..d126eb6f4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -2,6 +2,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c +ZGEMVNKERNEL = zgemv_n.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/dgemv_n_microk_haswell-2.c b/kernel/x86_64/dgemv_n_microk_haswell-2.c index 88c7d4163..b9f462cb2 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-2.c @@ -43,7 +43,7 @@ static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 192(%4,%0,8) \n\t" + "prefetcht0 192(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm4 \n\t" // 4 * y "vmovups 32(%3,%0,8), %%ymm5 \n\t" // 4 * y diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index be5b08dcd..141cb35df 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -28,6 +28,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#if defined(HASWELL) +#include "zgemv_n_microk_haswell-2.c" +#endif + #define NBMAX 1024 diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c new file mode 100644 index 000000000..8583f96b3 --- /dev/null +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void zgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +static void zgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + "prefetcht0 192(%5,%0,8) \n\t" + "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 + "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 + + "vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 + "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 + + "vfmadd231pd %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231pd %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231pd %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231pd %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 + "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 + + "vfmadd231pd %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231pd %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231pd %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231pd %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmadd231pd %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231pd %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, 
%%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "prefetcht0 192(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm12 \n\t" + "vmovups 32(%3,%0,8), %%ymm13 \n\t" + +#if !defined(XCONJ) + "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t" +#else + "vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t" + "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t" +#endif + + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From dbc2eff029b298e324c175b698ec132436e6df43 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 10 Aug 2014 11:57:24 +0200 Subject: [PATCH 57/74] disabled optimized haswell zgemv_n kernel for windows ( bad rounding ) --- kernel/x86_64/KERNEL.HASWELL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d126eb6f4..7d4cddbcc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -2,7 +2,10 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c + +ifndef OS_WINDOWS ZGEMVNKERNEL = zgemv_n.c +endif SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c From 6fe416976d1ad2ff8f60829cb01a63a11d876429 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Aug 2014 09:13:18 +0200 Subject: [PATCH 58/74] added optimimized zgemv_t c-kernel --- kernel/x86_64/zgemv_t.c | 267 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 kernel/x86_64/zgemv_t.c diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c new file mode 100644 index 000000000..a2dc45c45 --- /dev/null +++ b/kernel/x86_64/zgemv_t.c @@ -0,0 +1,267 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
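The vpermilpd/vaddsubpd epilogue in the zgemv_n micro-kernel above is the usual split-accumulator complex reduction: one register collects a*x_r per lane, another collects a*x_i, and a lane swap followed by addsub recombines them into (a_r*x_r - a_i*x_i, a_r*x_i + a_i*x_r). A tiny stand-alone C check of that arithmetic (illustrative only; the variable names are not taken from the kernel):

#include <complex.h>
#include <stdio.h>

int main(void)
{
        double a_r = 3.0, a_i = -2.0, x_r = 0.5, x_i = 4.0;

        /* lanes of the two accumulators after the FMA loop */
        double acc_xr[2] = { a_r * x_r, a_i * x_r };    /* ymm12-style lanes */
        double acc_xi[2] = { a_r * x_i, a_i * x_i };    /* ymm13-style lanes */

        /* vpermilpd $0x5 swaps the pair; vaddsubpd subtracts in lane 0 and adds in lane 1 */
        double y_r = acc_xr[0] - acc_xi[1];
        double y_i = acc_xr[1] + acc_xi[0];

        double complex ref = (a_r + a_i * I) * (x_r + x_i * I);
        printf("kernel-style: %g%+gi   reference: %g%+gi\n",
               y_r, y_i, creal(ref), cimag(ref));
        return 0;
}

The CONJ/XCONJ branches need the opposite signs on the cross terms, which the kernels arrange by placing the vpermilpd on the other accumulator around the same vaddsubpd.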
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" +#endif +*/ + +#define NBMAX 1028 + +#ifndef HAVE_KERNEL_16x4 + +static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; +#endif + } + y[0] = temp_r0; + y[1] = temp_i0; + y[2] = temp_r1; + y[3] = temp_i1; + y[4] = temp_r2; + y[5] = temp_i2; + y[6] = temp_r3; + y[7] = temp_i3; +} + +#endif + +static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i += a0[i]*x[i+1] + a0[i+1]*x[i]; +#else + temp_r += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i += a0[i]*x[i+1] - a0[i+1]*x[i]; +#endif + } + *y = temp_r; + *(y+1) = temp_i; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Mon, 11 Aug 2014 13:10:12 +0200 Subject: [PATCH 59/74] added optimized zgemv_t for haswell --- kernel/x86_64/dgemv_n_microk_haswell-2.c | 4 +- kernel/x86_64/dgemv_t_microk_haswell-2.c | 4 +- kernel/x86_64/sgemv_n_microk_bulldozer-2.c | 4 +- kernel/x86_64/sgemv_n_microk_haswell-2.c | 4 +- kernel/x86_64/sgemv_n_microk_nehalem-2.c | 4 +- kernel/x86_64/sgemv_n_microk_sandy-2.c | 4 +- kernel/x86_64/sgemv_t_microk_bulldozer-2.c | 4 +- kernel/x86_64/sgemv_t_microk_haswell-2.c | 4 +- kernel/x86_64/sgemv_t_microk_nehalem-2.c | 4 +- kernel/x86_64/sgemv_t_microk_sandy-2.c | 4 +- kernel/x86_64/zgemv_n_microk_haswell-2.c | 4 +- kernel/x86_64/zgemv_t_microk_bulldozer-2.c | 139 +++++++++++++++++++++ kernel/x86_64/zgemv_t_microk_haswell-2.c | 139 +++++++++++++++++++++ 13 files changed, 300 insertions(+), 22 deletions(-) create mode 100644 
kernel/x86_64/zgemv_t_microk_bulldozer-2.c create mode 100644 kernel/x86_64/zgemv_t_microk_haswell-2.c diff --git a/kernel/x86_64/dgemv_n_microk_haswell-2.c b/kernel/x86_64/dgemv_n_microk_haswell-2.c index b9f462cb2..28e2fe4f6 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/dgemv_t_microk_haswell-2.c b/kernel/x86_64/dgemv_t_microk_haswell-2.c index 94d4c319e..1a4ba37d7 100644 --- a/kernel/x86_64/dgemv_t_microk_haswell-2.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-2.c b/kernel/x86_64/sgemv_n_microk_bulldozer-2.c index d50fa4268..c4a490587 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-2.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_n_microk_haswell-2.c b/kernel/x86_64/sgemv_n_microk_haswell-2.c index d3fee67c3..19888d150 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-2.c b/kernel/x86_64/sgemv_n_microk_nehalem-2.c index 3cfb82a45..40ccbb78f 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-2.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_n_microk_sandy-2.c b/kernel/x86_64/sgemv_n_microk_sandy-2.c index 21eff1c5e..b255ddbcb 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-2.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-2.c b/kernel/x86_64/sgemv_t_microk_bulldozer-2.c index 54bdca63a..e4498afa3 100644 --- a/kernel/x86_64/sgemv_t_microk_bulldozer-2.c +++ b/kernel/x86_64/sgemv_t_microk_bulldozer-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_t_microk_haswell-2.c b/kernel/x86_64/sgemv_t_microk_haswell-2.c index cef703483..e6d47270d 100644 --- a/kernel/x86_64/sgemv_t_microk_haswell-2.c +++ b/kernel/x86_64/sgemv_t_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
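The float to FLOAT prototype changes gathered in this patch are more than cosmetic: the micro-kernel headers are included from sources where FLOAT is double (dgemv, zgemv), so kernels declared and defined with float were being handed double data by their FLOAT-typed callers. Routing everything through FLOAT keeps one source valid for both precisions. A minimal sketch of the convention; the typedef below is an illustrative stand-in for the real definition in common.h:

/* illustrative stand-in for the common.h precision switch */
#ifdef DOUBLE
typedef double FLOAT;
#else
typedef float FLOAT;
#endif

/* with FLOAT in the prototype, the same kernel source serves sgemv and dgemv */
void gemv_kernel_4x1(long n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
        FLOAT temp = 0.0;
        for (long i = 0; i < n; i++)
                temp += ap[i] * x[i];   /* dot product, one column at a time */
        *y = temp;
}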
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-2.c b/kernel/x86_64/sgemv_t_microk_nehalem-2.c index e1f2b81bd..db5a1448b 100644 --- a/kernel/x86_64/sgemv_t_microk_nehalem-2.c +++ b/kernel/x86_64/sgemv_t_microk_nehalem-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_t_microk_sandy-2.c b/kernel/x86_64/sgemv_t_microk_sandy-2.c index 6a3748238..841522302 100644 --- a/kernel/x86_64/sgemv_t_microk_sandy-2.c +++ b/kernel/x86_64/sgemv_t_microk_sandy-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c index 8583f96b3..833983fe0 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void zgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void zgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c new file mode 100644 index 000000000..efb6d784e --- /dev/null +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary froms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary from must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + + "vfmaddpd %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmaddpd %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmaddpd %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmaddpd %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmaddpd %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmaddpd %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmaddpd %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmaddpd %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + + "addq $4 , %0 \n\t" + "subq $2 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || 
( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + "vmovups %%xmm12, 32(%3) \n\t" + "vmovups %%xmm14, 48(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/zgemv_t_microk_haswell-2.c b/kernel/x86_64/zgemv_t_microk_haswell-2.c new file mode 100644 index 000000000..2dddef27d --- /dev/null +++ b/kernel/x86_64/zgemv_t_microk_haswell-2.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary froms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary from must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + + "addq $4 , %0 \n\t" + "subq $2 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 
\n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + "vmovups %%xmm12, 32(%3) \n\t" + "vmovups %%xmm14, 48(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 09fcd3a34135ad86f4b17f8e15893fe99b9f0171 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Aug 2014 14:19:25 +0200 Subject: [PATCH 60/74] add optimized zgemv_t kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/zgemv_t.c | 8 +- kernel/x86_64/zgemv_t_microk_bulldozer-2.c | 151 +++++++++++++-------- 3 files changed, 101 insertions(+), 60 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 893f13064..19bf7fd32 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c index a2dc45c45..b54d5f4e2 100644 --- a/kernel/x86_64/zgemv_t.c +++ b/kernel/x86_64/zgemv_t.c @@ -28,11 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -/* -#if defined(HASWELL) -#include "zgemv_t_microk_haswell-2.c" + +#if defined(BULLDOZER) +#include "zgemv_t_microk_bulldozer-2.c" #endif -*/ + #define NBMAX 1028 diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c index efb6d784e..65d5a10a2 100644 --- a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c @@ -37,77 +37,118 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ( "vzeroupper \n\t" - "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp - "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp - "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp - "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp + "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp + "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp + "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp + "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" // temp + "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" ".align 16 \n\t" ".L01LOOP%=: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 - "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 - "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 - "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 - "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 - "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 - "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 - "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 - "vfmaddpd %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmaddpd %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmaddpd %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmaddpd %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmaddpd %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmaddpd %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmaddpd %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmaddpd %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, 
%%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - "addq $4 , %0 \n\t" - "subq $2 , %1 \n\t" + "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 16(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 16(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 32(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 32(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 48(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 48(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" "jnz .L01LOOP%= \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" - "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" - "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" - "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" - "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" - "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" - "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vpermilpd $0x1 , 
%%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" #else - "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" - "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" - "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" - "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" - "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" - "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" - "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" - "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" - "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" - "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" - "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" #endif - "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" - "vextractf128 $1, %%ymm10, %%xmm11 \n\t" - "vextractf128 $1, %%ymm12, %%xmm13 \n\t" - "vextractf128 $1, %%ymm14, %%xmm15 \n\t" - - "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" - "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" - "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" - "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" "vmovups %%xmm8 , (%3) \n\t" "vmovups %%xmm10, 16(%3) \n\t" From 58b075daef4c65b02951ed8a8fd78dc53cab0893 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Aug 2014 16:57:52 +0200 Subject: [PATCH 61/74] added optimized zgemv_t kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/zgemv_t.c | 2 ++ kernel/x86_64/zgemv_t_microk_haswell-2.c | 29 ++++++++++++++++++++++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 7d4cddbcc..9a48289c5 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -6,6 +6,7 @@ DGEMVTKERNEL = dgemv_t.c ifndef OS_WINDOWS ZGEMVNKERNEL = zgemv_n.c endif +ZGEMVTKERNEL = zgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c index b54d5f4e2..df75afeff 100644 --- a/kernel/x86_64/zgemv_t.c +++ b/kernel/x86_64/zgemv_t.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) #include "zgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/zgemv_t_microk_haswell-2.c b/kernel/x86_64/zgemv_t_microk_haswell-2.c index 2dddef27d..99a620e44 100644 --- a/kernel/x86_64/zgemv_t_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_t_microk_haswell-2.c @@ -49,6 +49,7 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 @@ -56,9 +57,13 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + "prefetcht0 384(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "prefetcht0 384(%5,%0,8) \n\t" "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "prefetcht0 384(%6,%0,8) \n\t" "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "prefetcht0 384(%7,%0,8) \n\t" "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 @@ -70,9 +75,29 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 - "addq $4 , %0 \n\t" - "subq $2 , %1 \n\t" + "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" "jnz .L01LOOP%= \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) From 07c66b196093b3bc124f674a341ce304939eccde Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 12 Aug 2014 08:35:42 +0200 Subject: [PATCH 62/74] modified algorithm for better numerical stability --- kernel/x86_64/zgemv_n.c | 114 ++++++++++------------------------------ 1 file changed, 27 insertions(+), 87 deletions(-) diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index 
141cb35df..75e40eccb 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -48,8 +48,7 @@ static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) for ( i=0; i< 2*n; i+=2 ) { -#if !defined(CONJ) -#if !defined(XCONJ) +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; y[i] += a1[i]*x[2] - a1[i+1] * x[3]; @@ -67,29 +66,6 @@ static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; y[i] += a3[i]*x[6] + a3[i+1] * x[7]; y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; -#endif -#else -#if !defined(XCONJ) - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] -= a0[i]*x[1] - a0[i+1] * x[0]; - y[i] += a1[i]*x[2] + a1[i+1] * x[3]; - y[i+1] -= a1[i]*x[3] - a1[i+1] * x[2]; - y[i] += a2[i]*x[4] + a2[i+1] * x[5]; - y[i+1] -= a2[i]*x[5] - a2[i+1] * x[4]; - y[i] += a3[i]*x[6] + a3[i+1] * x[7]; - y[i+1] -= a3[i]*x[7] - a3[i+1] * x[6]; - -#else - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] -= a0[i]*x[1] + a0[i+1] * x[0]; - y[i] += a1[i]*x[2] - a1[i+1] * x[3]; - y[i+1] -= a1[i]*x[3] + a1[i+1] * x[2]; - y[i] += a2[i]*x[4] - a2[i+1] * x[5]; - y[i+1] -= a2[i]*x[5] + a2[i+1] * x[4]; - y[i] += a3[i]*x[6] - a3[i+1] * x[7]; - y[i+1] -= a3[i]*x[7] + a3[i+1] * x[6]; - -#endif #endif } } @@ -104,23 +80,12 @@ static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) for ( i=0; i< 2*n; i+=2 ) { -#if !defined(CONJ) -#if !defined(XCONJ) +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; #else y[i] += a0[i]*x[0] + a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; -#endif -#else -#if !defined(XCONJ) - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] -= a0[i]*x[1] - a0[i+1] * x[0]; - -#else - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] -= a0[i]*x[1] + a0[i+1] * x[0]; -#endif #endif } @@ -139,17 +104,24 @@ static void zero_y(BLASLONG n, FLOAT *dest) -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; + FLOAT temp_r; + FLOAT temp_i; for ( i=0; i Date: Tue, 12 Aug 2014 10:02:25 +0200 Subject: [PATCH 63/74] bugfix in zgemv_n_microk_haswell-2.c --- kernel/x86_64/zgemv_n_microk_haswell-2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c index 833983fe0..bb40ec3ac 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -99,8 +99,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #else "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" - "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif From b06550519eb6a02d87f73a48b73fcef2fdedb9c9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 12 Aug 2014 12:15:41 +0200 Subject: [PATCH 64/74] added optimized cgemv_t c-kernel --- kernel/x86_64/cgemv_t.c | 269 +++++++++++++++++++++++++ kernel/x86_64/zgemv_n_microk_sandy-2.c | 161 +++++++++++++++ 2 files changed, 430 insertions(+) create mode 100644 kernel/x86_64/cgemv_t.c create mode 100644 
kernel/x86_64/zgemv_n_microk_sandy-2.c diff --git a/kernel/x86_64/cgemv_t.c b/kernel/x86_64/cgemv_t.c new file mode 100644 index 000000000..ccdf13a57 --- /dev/null +++ b/kernel/x86_64/cgemv_t.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(BULLDOZER) +#include "zgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" +#endif +*/ + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_16x4 + +static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; +#endif + } + y[0] = temp_r0; + y[1] = temp_i0; + y[2] = temp_r1; + y[3] = temp_i1; + y[4] = temp_r2; + y[5] = temp_i2; + y[6] = temp_r3; + y[7] = temp_i3; +} + +#endif + +static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i += a0[i]*x[i+1] + a0[i+1]*x[i]; +#else + temp_r += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i += a0[i]*x[i+1] - a0[i+1]*x[i]; +#endif + } + *y = temp_r; + *(y+1) = temp_i; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Wed, 13 Aug 2014 12:18:03 +0200 Subject: [PATCH 65/74] bugfix in zgemv_n_microk_sandy-2.c --- kernel/x86_64/zgemv_n_microk_sandy-2.c | 35 ++++++++------------------ 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/kernel/x86_64/zgemv_n_microk_sandy-2.c b/kernel/x86_64/zgemv_n_microk_sandy-2.c index 8061ed4fa..f90e2210a 100644 --- a/kernel/x86_64/zgemv_n_microk_sandy-2.c +++ b/kernel/x86_64/zgemv_n_microk_sandy-2.c @@ -50,22 +50,13 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 - "vmulpd %%ymm8 , %%ymm0 , %%ymm10 \n\t" - "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" - "vmulpd %%ymm8 , %%ymm1 , %%ymm11 \n\t" - "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" - "vmulpd %%ymm9 , %%ymm0 , %%ymm10 \n\t" - "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" - "vmulpd %%ymm9 , %%ymm1 , %%ymm11 \n\t" - "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" + "vmulpd %%ymm9 , %%ymm0 , %%ymm14 
\n\t" + "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -103,6 +94,10 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + "prefetcht0 192(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" @@ -117,18 +112,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif - "prefetcht0 192(%3,%0,8) \n\t" - "vmovups (%3,%0,8), %%ymm12 \n\t" - "vmovups 32(%3,%0,8), %%ymm13 \n\t" - -#if !defined(XCONJ) - "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t" -#else - "vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t" - "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t" -#endif - + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t" From 11e34ddd1b45832606d5ef000d07519410f30676 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 12:54:18 +0200 Subject: [PATCH 66/74] bugfix for zgemv_n_microk_haswell-2.c --- kernel/x86_64/KERNEL.HASWELL | 4 +--- kernel/x86_64/zgemv_n.c | 11 ++++++++- kernel/x86_64/zgemv_n_microk_haswell-2.c | 30 +++++++----------------- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9a48289c5..2d54920cc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -3,10 +3,8 @@ SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c -ifndef OS_WINDOWS ZGEMVNKERNEL = zgemv_n.c -endif -ZGEMVTKERNEL = zgemv_t.c +#ZGEMVTKERNEL = zgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index 75e40eccb..7b8907044 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -25,7 +25,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - +#include +#include #include "common.h" #if defined(HASWELL) @@ -141,6 +142,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, BLASLONG n2; FLOAT xbuffer[8],*ybuffer; + +#if 0 +printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y); +#endif + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + ybuffer = buffer; inc_x *= 2; diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c index bb40ec3ac..e1c5838f7 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - "prefetcht0 192(%5,%0,8) \n\t" "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 - "vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r - "vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i - "vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r - "vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "prefetcht0 192(%6,%0,8) \n\t" "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 @@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + "prefetcht0 192(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" @@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif - "prefetcht0 192(%3,%0,8) \n\t" - "vmovups (%3,%0,8), %%ymm12 \n\t" - "vmovups 32(%3,%0,8), %%ymm13 \n\t" - -#if !defined(XCONJ) - "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t" -#else - "vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t" - "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t" -#endif - + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t" From 8c582d362d0f8a53c222dc4c9cbb7919cdb32116 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 13:42:22 +0200 Subject: [PATCH 67/74] optimized zgemv_t_microk_haswell-2.c --- kernel/x86_64/KERNEL.HASWELL | 2 +- 
kernel/x86_64/zgemv_t.c | 3 +++ kernel/x86_64/zgemv_t_microk_haswell-2.c | 28 +++++++++++------------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 2d54920cc..6d0792f16 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -4,7 +4,7 @@ DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c ZGEMVNKERNEL = zgemv_n.c -#ZGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c index df75afeff..bb3f90420 100644 --- a/kernel/x86_64/zgemv_t.c +++ b/kernel/x86_64/zgemv_t.c @@ -141,6 +141,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG n2; FLOAT ybuffer[8],*xbuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + inc_x *= 2; inc_y *= 2; lda *= 2; diff --git a/kernel/x86_64/zgemv_t_microk_haswell-2.c b/kernel/x86_64/zgemv_t_microk_haswell-2.c index 99a620e44..8325db5cf 100644 --- a/kernel/x86_64/zgemv_t_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_t_microk_haswell-2.c @@ -49,23 +49,22 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 384(%2,%0,8) \n\t" + "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 - "prefetcht0 384(%4,%0,8) \n\t" - "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 - "prefetcht0 384(%5,%0,8) \n\t" - "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 - "prefetcht0 384(%6,%0,8) \n\t" - "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 - "prefetcht0 384(%7,%0,8) \n\t" - "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 - "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 @@ -75,17 +74,16 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 - "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 - "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 - - "vmovups 32(%4,%0,8), 
%%ymm4 \n\t" // 2 complex values from a0 - "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 From 2470129132df121aa922e4abe955c64a5d1385cb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 13:54:19 +0200 Subject: [PATCH 68/74] added fast return, if m or n < 1 --- kernel/x86_64/dgemv_n.c | 3 +++ kernel/x86_64/dgemv_t.c | 3 +++ kernel/x86_64/sgemv_n.c | 3 +++ kernel/x86_64/sgemv_t.c | 3 +++ kernel/x86_64/zgemv_t.c | 2 +- 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemv_n.c b/kernel/x86_64/dgemv_n.c index 5192ba193..5d826dc63 100644 --- a/kernel/x86_64/dgemv_n.c +++ b/kernel/x86_64/dgemv_n.c @@ -125,6 +125,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; FLOAT xbuffer[4],*ybuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + ybuffer = buffer; n1 = n / 4 ; diff --git a/kernel/x86_64/dgemv_t.c b/kernel/x86_64/dgemv_t.c index 76aacd349..0fa8378fe 100644 --- a/kernel/x86_64/dgemv_t.c +++ b/kernel/x86_64/dgemv_t.c @@ -104,6 +104,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; FLOAT ybuffer[4],*xbuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + xbuffer = buffer; n1 = n / 4 ; diff --git a/kernel/x86_64/sgemv_n.c b/kernel/x86_64/sgemv_n.c index f2de1b76a..faa8e1f8c 100644 --- a/kernel/x86_64/sgemv_n.c +++ b/kernel/x86_64/sgemv_n.c @@ -131,6 +131,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; FLOAT xbuffer[4],*ybuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + ybuffer = buffer; n1 = n / 4 ; diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c index adfaa9925..532afee5d 100644 --- a/kernel/x86_64/sgemv_t.c +++ b/kernel/x86_64/sgemv_t.c @@ -110,6 +110,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; FLOAT ybuffer[4],*xbuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + xbuffer = buffer; n1 = n / 4 ; diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c index bb3f90420..9f5444a72 100644 --- a/kernel/x86_64/zgemv_t.c +++ b/kernel/x86_64/zgemv_t.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #include "zgemv_t_microk_bulldozer-2.c" #elif defined(HASWELL) #include "zgemv_t_microk_haswell-2.c" From dc0593731365c17a694d297afd5ba8bf1bfaf0cd Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 14:54:50 +0200 Subject: [PATCH 69/74] added additional test values --- lapack-netlib/TESTING/dstest.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/dstest.in b/lapack-netlib/TESTING/dstest.in index 6ec68d13a..4a31076a6 100644 --- a/lapack-netlib/TESTING/dstest.in +++ b/lapack-netlib/TESTING/dstest.in @@ -1,8 +1,8 @@ Data file for testing DSGESV/DSPOSV LAPACK routines 12 Number of values of M 0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension) -4 Number of values of NRHS -1 2 14 16 Values of NRHS (number of right hand sides) +6 Number of values of NRHS +1 2 14 15 16 13 Values of NRHS (number of right hand sides) 30.0 Threshold value of test ratio T Put T to test the driver routine T Put T to test the error exits From c1a6374c6fe7df294aeca2c550bc58d61acfa654 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 16:10:03 +0200 Subject: [PATCH 70/74] optimized zgemv_n kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 3 +++ kernel/x86_64/zgemv_n.c | 3 +++ kernel/x86_64/zgemv_n_microk_sandy-2.c | 13 ++++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index d4fbca7f2..b654d3564 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,6 +1,9 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c +ZGEMVNKERNEL = zgemv_n.c + + SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index 7b8907044..9098368a5 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -31,9 +31,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(HASWELL) #include "zgemv_n_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "zgemv_n_microk_sandy-2.c" #endif + #define NBMAX 1024 #ifndef HAVE_KERNEL_16x4 diff --git a/kernel/x86_64/zgemv_n_microk_sandy-2.c b/kernel/x86_64/zgemv_n_microk_sandy-2.c index f90e2210a..352c60f87 100644 --- a/kernel/x86_64/zgemv_n_microk_sandy-2.c +++ b/kernel/x86_64/zgemv_n_microk_sandy-2.c @@ -50,39 +50,42 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" + "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" + "prefetcht0 256(%5,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" - "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" - "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t" "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t" "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" + "prefetcht0 256(%6,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t" "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" - "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm4 , %%ymm10 \n\t" "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm5 , %%ymm11 \n\t" "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" + "prefetcht0 256(%7,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm4 , %%ymm10 \n\t" "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmulpd %%ymm9 , %%ymm5 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" - "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%7,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t" @@ -94,7 +97,7 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" - "prefetcht0 192(%3,%0,8) \n\t" + "prefetcht0 256(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" From 4568d32b6bb1ad27882268b8866ef35def75605e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 14 Aug 2014 14:10:29 +0200 Subject: [PATCH 71/74] added optimized cgemv_t kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/cgemv_t.c | 18 +-- kernel/x86_64/cgemv_t_microk_haswell-2.c | 171 +++++++++++++++++++++++ 3 files changed, 180 insertions(+), 11 deletions(-) create mode 100644 kernel/x86_64/cgemv_t_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 6d0792f16..e07448abb 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -6,6 +6,8 @@ DGEMVTKERNEL = dgemv_t.c ZGEMVNKERNEL = zgemv_n.c ZGEMVTKERNEL = zgemv_t.c +CGEMVTKERNEL = cgemv_t.c + SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c diff --git a/kernel/x86_64/cgemv_t.c b/kernel/x86_64/cgemv_t.c index ccdf13a57..e40fd349e 100644 --- a/kernel/x86_64/cgemv_t.c +++ b/kernel/x86_64/cgemv_t.c @@ -28,19 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -/* -#if defined(BULLDOZER) -#include "zgemv_t_microk_bulldozer-2.c" -#elif defined(HASWELL) -#include "zgemv_t_microk_haswell-2.c" +#if defined(HASWELL) +#include "cgemv_t_microk_haswell-2.c" #endif -*/ #define NBMAX 2048 #ifndef HAVE_KERNEL_16x4 -static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; @@ -92,7 +88,7 @@ static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif -static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; @@ -113,7 +109,7 @@ static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) *y = temp_r; *(y+1) = temp_i; } - + static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; @@ -176,7 +172,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; - zgemv_kernel_16x4(NB,ap,xbuffer,ybuffer); + cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer); a_ptr += 4 * lda; #if !defined(XCONJ) @@ -210,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for( i = 0; i < n2 ; i++) { - zgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer); + cgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer); a_ptr += 1 * lda; #if !defined(XCONJ) diff --git a/kernel/x86_64/cgemv_t_microk_haswell-2.c b/kernel/x86_64/cgemv_t_microk_haswell-2.c new file mode 100644 index 000000000..0d79714af --- /dev/null +++ b/kernel/x86_64/cgemv_t_microk_haswell-2.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary froms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary from must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "prefetcht0 192(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "prefetcht0 192(%2,%0,4) \n\t" + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "prefetcht0 192(%6,%0,4) \n\t" + "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 + "prefetcht0 192(%7,%0,4) \n\t" + "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 2 complex values from a1 + + "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 2 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $16 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" + "vpermilps $0xb1 , 
%%ymm11, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" + "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" + "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t" + "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vmovsd %%xmm8 , (%3) \n\t" + "vmovsd %%xmm10, 8(%3) \n\t" + "vmovsd %%xmm12, 16(%3) \n\t" + "vmovsd %%xmm14, 24(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 11eab4c0199f85ece37453f351ffee6450bf8c7c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 14 Aug 2014 19:00:30 +0200 Subject: [PATCH 72/74] added optimized cgemv_n for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/cgemv_n.c | 255 +++++++++++++++++++++++ kernel/x86_64/cgemv_n_microk_haswell-2.c | 137 ++++++++++++ 3 files changed, 394 insertions(+) create mode 100644 kernel/x86_64/cgemv_n.c create mode 100644 kernel/x86_64/cgemv_n_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index e07448abb..d0ac9c72f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,11 +1,13 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c + DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c ZGEMVNKERNEL = zgemv_n.c ZGEMVTKERNEL = zgemv_t.c +CGEMVNKERNEL = cgemv_n.c CGEMVTKERNEL = cgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S diff --git a/kernel/x86_64/cgemv_n.c b/kernel/x86_64/cgemv_n.c new file mode 100644 index 000000000..47ef0d447 --- /dev/null +++ b/kernel/x86_64/cgemv_n.c @@ -0,0 +1,255 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "common.h"
+
+#if defined(HASWELL)
+#include "cgemv_n_microk_haswell-2.c"
+#endif
+
+
+#define NBMAX 2048
+
+#ifndef HAVE_KERNEL_16x4
+
+static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+	BLASLONG i;
+	FLOAT *a0,*a1,*a2,*a3;
+	a0 = ap[0];
+	a1 = ap[1];
+	a2 = ap[2];
+	a3 = ap[3];
+
+	for ( i=0; i< 2*n; i+=2 )
+	{
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+		y[i]   += a0[i]*x[0] - a0[i+1] * x[1];
+		y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
+		y[i]   += a1[i]*x[2] - a1[i+1] * x[3];
+		y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
+		y[i]   += a2[i]*x[4] - a2[i+1] * x[5];
+		y[i+1] += a2[i]*x[5] + a2[i+1] * x[4];
+		y[i]   += a3[i]*x[6] - a3[i+1] * x[7];
+		y[i+1] += a3[i]*x[7] + a3[i+1] * x[6];
+#else
+		y[i]   += a0[i]*x[0] + a0[i+1] * x[1];
+		y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
+		y[i]   += a1[i]*x[2] + a1[i+1] * x[3];
+		y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
+		y[i]   += a2[i]*x[4] + a2[i+1] * x[5];
+		y[i+1] += a2[i]*x[5] - a2[i+1] * x[4];
+		y[i]   += a3[i]*x[6] + a3[i+1] * x[7];
+		y[i+1] += a3[i]*x[7] - a3[i+1] * x[6];
+#endif
+	}
+}
+
+#endif
+
+static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+	BLASLONG i;
+	FLOAT *a0;
+	a0 = ap;
+
+	for ( i=0; i< 2*n; i+=2 )
+	{
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+		y[i]   += a0[i]*x[0] - a0[i+1] * x[1];
+		y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
+#else
+		y[i]   += a0[i]*x[0] + a0[i+1] * x[1];
+		y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
+#endif
+
+	}
+}
+
+
+static void zero_y(BLASLONG n, FLOAT *dest)
+{
+	BLASLONG i;
+	for ( i=0; i<2*n; i++ )
+	{
+		*dest = 0.0;
+		dest++;
+	}
+}
+
+
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+	BLASLONG i;
+	FLOAT temp_r;
+	FLOAT temp_i;
+	for ( i=0; i Date: Fri, 15 Aug 2014 12:40:10 +0200
Subject: [PATCH 73/74] Ref #433: removed obsolete lapack entries from common_interface.h

---
 common_interface.h | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/common_interface.h b/common_interface.h
index 6ab3450a0..ddd2cf6e5 100644
--- a/common_interface.h
+++ b/common_interface.h
@@ -679,13 +679,6 @@ int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float
 int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
 int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);
 
-int BLASFUNC(sgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *);
-int BLASFUNC(dgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *);
-int BLASFUNC(qgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
-int BLASFUNC(cgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *);
-int BLASFUNC(zgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *);
-int BLASFUNC(xgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
-
 int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *);
 int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);
@@ -735,19 +728,6 @@ int BLASFUNC(ctrtri)(char *, char *, blasint *, float *, blasint *, blasint *);
 int BLASFUNC(ztrtri)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
 
-int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *);
-int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
-int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
-int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *);
-int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
-int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
-
-int BLASFUNC(slarf)(char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *);
-int BLASFUNC(dlarf)(char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *);
-int BLASFUNC(qlarf)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *);
-int BLASFUNC(clarf)(char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *);
-int BLASFUNC(zlarf)(char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *);
-int BLASFUNC(xlarf)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *);
 
 FLOATRET BLASFUNC(slamch)(char *);
 double BLASFUNC(dlamch)(char *);

From a69dd3fbc5c38f7098d1539a69963c0d2bd3163a Mon Sep 17 00:00:00 2001
From: Zhang Xianyi Date: Mon, 18 Aug 2014 11:15:42 +0800
Subject: [PATCH 74/74] OpenBLAS 0.2.11 version.

---
 Changelog.txt | 15 +++++++++++++++
 Makefile.rule |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/Changelog.txt b/Changelog.txt
index 5b85227b2..d33cffc7b 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,19 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.2.11
+18-Aug-2014
+common:
+	* Added some benchmark codes.
+	* Fixed link error on Linux/musl. (Thanks, Isaac Dunham)
+
+x86/x86-64:
+	* Improved s/c/zgemm performance for Intel Haswell.
+	* Improved s/d/c/zgemv performance.
+	* Support for big NUMA machines. (EXPERIMENTAL)
+
+ARM:
+	* Fixed detection when cpuinfo uses "Processor". (Thanks, Isaiah)
+
 ====================================================================
 Version 0.2.10
 16-Jul-2014
diff --git a/Makefile.rule b/Makefile.rule
index 7bbb39e7a..7430320b7 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.2.10
+VERSION = 0.2.11
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library