From 53e6dbf6ca6e7c798e0ed0dfd24a78570a814553 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 30 Aug 2014 13:36:27 +0200 Subject: [PATCH 01/44] optimized sgemv_t kernel for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/sgemv_t_4.c | 235 +++++++++++++++++++++++ kernel/x86_64/sgemv_t_microk_nehalem-4.c | 99 ++++++++++ 3 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_t_4.c create mode 100644 kernel/x86_64/sgemv_t_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 8adb579cf..00c3b4d15 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -10,7 +10,7 @@ SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c new file mode 100644 index 000000000..e0eb9220b --- /dev/null +++ b/kernel/x86_64/sgemv_t_4.c @@ -0,0 +1,235 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(NEHALEM) +#include "sgemv_t_microk_nehalem-4.c" +#endif + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + *y = temp; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer); + ap[0] += lda4 ; + ap[1] += lda4 ; + ap[2] += lda4 ; + ap[3] += lda4 ; + a_ptr += lda4 ; + if ( inc_y == 1 ) + { + + __asm__ __volatile__ + ( + "movss (%0) , %%xmm10 \n\t" + "shufps $0 , %%xmm10 , %%xmm10 \n\t" + "movups (%1) , %%xmm12 \n\t" + "movups (%2) , %%xmm11 \n\t" + "mulps %%xmm10 , %%xmm12 \n\t" + + "addps %%xmm11 , %%xmm12 \n\t" + "movups %%xmm12, (%2) \n\t" + + : + : + "r" (&alpha), // 0 + "r" (ybuffer), // 1 + "r" (y_ptr) // 2 + : + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + y_ptr += 4; + + } + else + { + *y_ptr += ybuffer[0]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[2]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[3]*alpha; + y_ptr += inc_y; + } + } + + for( i = 0; i < n2 ; i++) + { + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += 1 * lda; + *y_ptr += ybuffer[0]*alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + xbuffer = buffer; + x_ptr = x; + for ( i=0; i< m3; i++ ) + { + xbuffer[i] = *x_ptr; + x_ptr += inc_x; + } + j=0; + a_ptr = a; + y_ptr = y; + while ( j < n) + { + FLOAT temp = 0.0; + for( i = 0; i < m3; i++ ) + { + temp += a_ptr[i] * xbuffer[i]; + } + a_ptr += lda; + y_ptr[0] += alpha * temp; + y_ptr += inc_y; + j++; + } + return(0); +} + + diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-4.c b/kernel/x86_64/sgemv_t_microk_nehalem-4.c new file mode 100644 index 000000000..4a167900e --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_nehalem-4.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" + "xorps %%xmm6 , %%xmm6 \n\t" + "xorps %%xmm7 , %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 + "movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1 + "movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2 + "movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3 + + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm12, %%xmm9 \n\t" + "mulps %%xmm12, %%xmm10 \n\t" + "mulps %%xmm12, %%xmm11 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm10, %%xmm6 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "movss %%xmm4, (%3) \n\t" + "movss %%xmm5, 4(%3) \n\t" + "movss %%xmm6, 8(%3) \n\t" + "movss %%xmm7, 12(%3) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", + "memory" + ); + +} + + From e2fc8c8c2cd490d8774eb0d2b74e3060373a0199 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 30 Aug 2014 13:58:02 +0200 Subject: [PATCH 02/44] changed 1 test value (bug in lapack-testing?) 
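Note: only one entry in the list of M values changes (120 becomes 112). dstest.in drives the DSGESV/DSPOSV mixed-precision tests; presumably the M=120 case tripped the 30.0 test-ratio threshold, and the testing harness rather than the library is suspected.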
--- lapack-netlib/TESTING/dstest.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/dstest.in b/lapack-netlib/TESTING/dstest.in index 4a31076a6..b5a9f29f4 100644 --- a/lapack-netlib/TESTING/dstest.in +++ b/lapack-netlib/TESTING/dstest.in @@ -1,6 +1,6 @@ Data file for testing DSGESV/DSPOSV LAPACK routines 12 Number of values of M -0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension) +0 1 2 13 17 45 78 91 101 119 112 132 values of M (row dimension) 6 Number of values of NRHS 1 2 14 15 16 13 Values of NRHS (number of right hand sides) 30.0 Threshold value of test ratio From 848c0f16f7740563be56dc11f2b6c10ef174024e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 31 Aug 2014 13:23:44 +0200 Subject: [PATCH 03/44] optimized sgemv_t_4.c for small size --- kernel/x86_64/sgemv_t_4.c | 150 +++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 42 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index e0eb9220b..cefbaccd4 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -64,6 +64,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; @@ -71,11 +73,51 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) a0 = ap; FLOAT temp = 0.0; + if (n <=0 ) return; +/* for ( i=0; i< n; i+=4 ) { temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; } *y = temp; +*/ + + i=0; + + __asm__ __volatile__ + ( + "xorps %%xmm10 , %%xmm10 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,4) , %%xmm12 \n\t" + "movups (%4,%0,4) , %%xmm11 \n\t" + "mulps %%xmm11 , %%xmm12 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + + "movss %%xmm10, (%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 + : "cc", + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) @@ -89,6 +131,57 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) } } +static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + + BLASLONG i; + + if ( inc_dest != 1 ) + { + for ( i=0; i> 2 ; n2 = n & 3 ; @@ -140,65 +235,36 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *ap[4]; + FLOAT *yp; BLASLONG register lda4 = 4 * lda; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; + yp = ytemp; for( i = 0; i < n1 ; i++) { - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer); + sgemv_kernel_4x4(NB,ap,xbuffer,yp); ap[0] += lda4 ; ap[1] += lda4 ; ap[2] += lda4 ; ap[3] += lda4 ; - a_ptr += lda4 ; - if ( inc_y == 1 ) - { - - __asm__ __volatile__ - ( - "movss (%0) , %%xmm10 \n\t" - "shufps $0 , %%xmm10 , %%xmm10 \n\t" - "movups (%1) , %%xmm12 \n\t" - "movups (%2) , %%xmm11 \n\t" - "mulps %%xmm10 , %%xmm12 \n\t" - - "addps %%xmm11 , %%xmm12 \n\t" - "movups %%xmm12, (%2) \n\t" - - : - : - "r" (&alpha), // 0 - "r" (ybuffer), // 1 - "r" (y_ptr) // 2 - : - "%xmm10", "%xmm11", "%xmm12", - "memory" - ); - - y_ptr += 4; - - } - 
else - { - *y_ptr += ybuffer[0]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[2]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[3]*alpha; - y_ptr += inc_y; - } + yp += 4; + } + if ( n1 > 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; } for( i = 0; i < n2 ; i++) { + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += 1 * lda; - *y_ptr += ybuffer[0]*alpha; + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; y_ptr += inc_y; } From bc99faef1bf2e1a98e99dcf6cfba2ea58ae0a56e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 31 Aug 2014 14:33:15 +0200 Subject: [PATCH 04/44] optimized sgemv_t_4.c for uneven sizes --- kernel/x86_64/sgemv_t_4.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index cefbaccd4..76187b57d 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -273,28 +273,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } if ( m3 == 0 ) return(0); - xbuffer = buffer; + x_ptr = x; + a_ptr = a; for ( i=0; i< m3; i++ ) { - xbuffer[i] = *x_ptr; - x_ptr += inc_x; - } - j=0; - a_ptr = a; - y_ptr = y; - while ( j < n) - { - FLOAT temp = 0.0; - for( i = 0; i < m3; i++ ) + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + for ( j=0; j Date: Sun, 31 Aug 2014 15:38:18 +0200 Subject: [PATCH 05/44] modified benchmark/gemv.c --- benchmark/gemv.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index e26a36ac1..c5db09d89 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -151,23 +151,26 @@ int MAIN__(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); - if ((n>0) && (n<=to)) has_param_n = 1; + if ((n>0)) has_param_n = 1; } + int tomax = to; + if ( n > tomax ) tomax = n; + if ( has_param_n == 1 ) fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); else fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } From d7f91f8b4f506b0e6071c61164a8e1c7ac8f32e9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 1 Sep 2014 15:07:36 +0200 Subject: [PATCH 06/44] extended gemv.c benchmark --- benchmark/gemv.c | 103 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index c5db09d89..e21868259 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){ blasint inc_x=1,inc_y=1; blasint n=0; int has_param_n = 0; + int has_param_m = 0; int loops = 1; int l; char *p; @@ 
-145,6 +146,9 @@ int MAIN__(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} + + int tomax = to; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); @@ -152,15 +156,18 @@ int MAIN__(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } - int tomax = to; - if ( n > tomax ) tomax = n; - if ( has_param_n == 1 ) - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); - else - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); @@ -180,50 +187,80 @@ int MAIN__(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + if (has_param_m == 0) { - timeg=0; + for(m = from; m <= to; m += step) + { + timeg=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d : ", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } - if ( has_param_n == 0 ) n = m; + for (l=0; l Date: Mon, 1 Sep 2014 15:11:37 +0200 Subject: [PATCH 07/44] optimized sgemv_t_4.c for small sizes --- kernel/x86_64/sgemv_t_4.c | 123 ++++++++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 18 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 76187b57d..ae1279296 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -64,23 +64,63 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif +static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorps %%xmm10 , %%xmm10 \n\t" + "xorps %%xmm11 , %%xmm11 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,4) , %%xmm14 \n\t" // x + "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm11, %%xmm11 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm11, %%xmm11 \n\t" + + "movss %%xmm10, (%2) \n\t" + "movss %%xmm11,4(%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 + "r" (x) // 5 + : "cc", + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + +} + static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; - FLOAT *a0; - a0 = ap; 
- FLOAT temp = 0.0; - - if (n <=0 ) return; -/* - for ( i=0; i< n; i+=4 ) - { - temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - } - *y = temp; -*/ i=0; @@ -259,7 +299,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += n1 * lda4 ; } - for( i = 0; i < n2 ; i++) + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); @@ -276,20 +328,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO x_ptr = x; a_ptr = a; - for ( i=0; i< m3; i++ ) + if ( m3 == 3 ) { - FLOAT xtemp = *x_ptr * alpha; + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + FLOAT *aj = a_ptr; y_ptr = y; for ( j=0; j Date: Tue, 2 Sep 2014 12:42:36 +0200 Subject: [PATCH 08/44] optimized sgemv_t for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/sgemv_t_4.c | 6 +- kernel/x86_64/sgemv_t_microk_bulldozer-4.c | 147 +++++++++++++++++++++ 3 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer-4.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 6318b202c..346315aba 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -11,7 +11,7 @@ SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.c diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index ae1279296..5568b98cc 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" +#elif defined(BULLDOZER) +#include "sgemv_t_microk_bulldozer-4.c" #endif #define NBMAX 4096 @@ -202,9 +204,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "movups (%4,%0,4) , %%xmm11 \n\t" "mulps %%xmm10 , %%xmm12 \n\t" "addq $4 , %0 \n\t" - "addps %%xmm11 , %%xmm12 \n\t" + "addps %%xmm12 , %%xmm11 \n\t" "subq $4 , %1 \n\t" - "movups %%xmm12, -16(%4,%0,4) \n\t" + "movups %%xmm11, -16(%4,%0,4) \n\t" "jnz .L01LOOP%= \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c new file mode 100644 index 000000000..40e318de3 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t" + "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t" + "addq $16, %0 \n\t" + "vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmaddps 
%%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t" + "subq $16, %1 \n\t" + "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From f3b50dcf5b1f06b8d778544f70d8e85e0f445090 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 13:35:41 +0200 Subject: [PATCH 09/44] removed obsolete instructions from sgemv_t_4.c --- kernel/x86_64/sgemv_t_4.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 5568b98cc..2d0648a6c 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -377,8 +377,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr += inc_y; aj += lda; } - x_ptr += inc_x; - a_ptr++ ; return(0); } From 210bec9111c5252dfe600795c3ac63baaa060a9c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 14:11:42 +0200 Subject: [PATCH 10/44] added plot-header to compare multithreading --- benchmark/tplot-header | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 benchmark/tplot-header diff --git a/benchmark/tplot-header b/benchmark/tplot-header new file mode 100644 index 000000000..b7ce7f225 --- /dev/null +++ b/benchmark/tplot-header @@ -0,0 +1,42 @@ +# ********************************************************************************** +# Copyright (c) 2014, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +set term x11 font sans; +set ylabel "MFlops"; +set xlabel "Size"; +set grid xtics; +set grid ytics; +set key left; +set timestamp "generated on %Y-%m-%d by `whoami`" +set title "Sgemv\nTRANS=T\nBulldozer" +plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier; +set output "print.png"; +show title; +show plot; +show output; + + From f4ff889491de5d95d24d9d4edcbd85b0f83ff380 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 16:30:04 +0200 Subject: [PATCH 11/44] updated interface/gemv.c for multithreading --- interface/gemv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 08553ad21..3bcc099a5 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -215,8 +215,9 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_max = num_cpu_avail(2); int nthreads_avail = nthreads_max; + double MNK = (double) m * (double) n; - if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (128.0 * 32.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From d1800397f592226cd0cb933303c09de325034412 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 17:36:07 +0200 Subject: [PATCH 12/44] optimized interface/gemv.c for multithreading --- interface/gemv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 3bcc099a5..64dc641d0 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -215,9 +215,8 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_max = num_cpu_avail(2); int nthreads_avail = nthreads_max; - double MNK = (double) m * (double) n; - if ( MNK <= (128.0 * 32.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (96.0 * 24.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From 0fc560ba239767098f05f2f13161b036b2eb805d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 3 Sep 2014 10:13:47 +0200 Subject: [PATCH 13/44] bugfix for buffer overflow --- kernel/x86_64/sgemv_t_4.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 2d0648a6c..fb0ba9741 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -232,6 +232,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; + BLASLONG n0; BLASLONG n1; BLASLONG m1; BLASLONG m2; @@ -246,7 +247,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO xbuffer = buffer; ytemp = buffer + NBMAX; - n1 = n >> 2 ; + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; n2 = n & 3 ; m3 = m & 3 ; @@ -283,6 +285,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; + + if ( n0 > 0 ) 
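+ /* process whole NBMAX-column blocks first: each inner pass runs
+    nb1 = NBMAX/4 4x4 kernels, writing at most NBMAX floats into
+    ytemp before add_y flushes them into y; the previous single
+    n1 = n >> 2 loop could overrun ytemp whenever n > NBMAX */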
+ { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j Date: Wed, 3 Sep 2014 14:48:45 +0200 Subject: [PATCH 14/44] optimized sgemv_n for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/sgemv_n_4.c | 319 +++++++++++++++++++++++ kernel/x86_64/sgemv_n_microk_nehalem-4.c | 185 +++++++++++++ 3 files changed, 505 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_n_4.c create mode 100644 kernel/x86_64/sgemv_n_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 00c3b4d15..68c741cea 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -9,7 +9,7 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c new file mode 100644 index 000000000..f84016075 --- /dev/null +++ b/kernel/x86_64/sgemv_n_4.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_n_microk_bulldozer-4.c" +#elif defined(NEHALEM) +#include "sgemv_n_microk_nehalem-4.c" +#endif + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x8 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT *b0,*b1,*b2,*b3; + FLOAT *x4; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x4 = x + 4; + + for ( i=0; i< n; i+=4 ) + { + + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + + y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; + y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; + y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; + y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; + + } +} + +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + FLOAT da = *alpha; + for ( i=0; i Date: Wed, 3 Sep 2014 15:34:30 +0200 Subject: [PATCH 15/44] optimized sgemv_n_4.c --- kernel/x86_64/sgemv_n_4.c | 34 ++++++++++++++---------- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 5 +++- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index f84016075..31d841ddd 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -174,9 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n1; BLASLONG m1; BLASLONG m2; + BLASLONG m3; BLASLONG n2; - BLASLONG lda4 = 4 * lda; - BLASLONG lda8 = 8 * lda; + BLASLONG lda4 = lda << 2; + BLASLONG lda8 = lda << 3; FLOAT xbuffer[8],*ybuffer; if ( m < 1 ) return(0); @@ -186,19 +187,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( inc_x == 1 ) { - n1 = n / 8 ; - n2 = n % 8 ; + n1 = n >> 3 ; + n2 = n & 7 ; } else { - n1 = n / 4 ; - n2 = n % 4 ; + n1 = n >> 2 ; + n2 = n & 3 ; } - m1 = m - ( m % 4 ); - m2 = (m % NBMAX) - (m % 4) ; - + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; BLASLONG NB = NBMAX; @@ -237,8 +240,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT 
alpha, FLOAT *a, BLASLO x_ptr += 8; } -/* - for( i = 0; i < n1 ; i++) + + if ( n2 & 4 ) { sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer); ap[0] += lda4; @@ -248,8 +251,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; x_ptr += 4; } -*/ - for( i = 0; i < n2 ; i++) + + for( i = 0; i < ( n2 & 3 ) ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -296,8 +299,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a += NB; y_ptr += NB * inc_y; } + + if ( m3 == 0 ) return; + j=0; - while ( j < (m % 4)) + while ( j < m3 ) { a_ptr = a; x_ptr = x; diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index accc529b3..f87cfa425 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -58,13 +58,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "xorps %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + ".align 2 \n\t" "movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm9 \n\t" "movups (%6,%0,4), %%xmm10 \n\t" "movups (%7,%0,4), %%xmm11 \n\t" + ".align 2 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" "mulps %%xmm14, %%xmm10 \n\t" @@ -78,6 +80,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "movups (%5,%8,4), %%xmm9 \n\t" "movups (%6,%8,4), %%xmm10 \n\t" "movups (%7,%8,4), %%xmm11 \n\t" + ".align 2 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" "mulps %%xmm2 , %%xmm10 \n\t" From 7f910010a08f84b6ed74149f6cdcaaa71ca7f09b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 4 Sep 2014 13:09:27 +0200 Subject: [PATCH 16/44] optimized sgemv_n kernel for small sizes --- kernel/x86_64/sgemv_n_4.c | 41 +++- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 254 +++++++++++++++++++++ 2 files changed, 283 insertions(+), 12 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer-4.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 31d841ddd..f1573dd30 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -185,17 +185,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - if ( inc_x == 1 ) - { - n1 = n >> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } + n1 = n >> 3 ; + n2 = n & 7 ; m3 = m & 3 ; m1 = m & -4 ; @@ -267,6 +258,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[5] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[7] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + } + + if ( n2 & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -284,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; } - for( i = 0; i < n2 ; i++) + for( i = 0; i < ( n2 & 3) ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c new file mode 100644 index 
000000000..53287df75 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -0,0 +1,254 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 + "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 + "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 + "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 + "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "addq $4 , %0 \n\t" + + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "addq $4 , %8 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm6 \n\t" + "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y + + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vmovups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), 
%%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "addq $8 , %8 \n\t" + "subq $8 , %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + ".align 2 \n\t" + "vmovups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + "vmovups 32(%3,%0,4), %%xmm6 \n\t" // 4 * y + "vmovups 48(%3,%0,4), %%xmm7 \n\t" // 4 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" + + "prefetcht0 192(%4,%8,4) \n\t" + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "prefetcht0 192(%6,%8,4) \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" + "addq $16, %0 \n\t" + "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" + + "addq $16, %8 \n\t" + "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y + "subq $16, %1 \n\t" + "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" 
(ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm6 \n\t" + + "vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 53de943690abc2f500ae131627136c9fbd35e541 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 4 Sep 2014 18:55:52 +0200 Subject: [PATCH 17/44] bugfix for sgemv_n_4.c --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/sgemv_n_4.c | 41 ++++++++++------------------------ 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 346315aba..0fd7ac35f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -10,7 +10,7 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index f1573dd30..31d841ddd 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -185,8 +185,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - n1 = n >> 3 ; - n2 = n & 7 ; + if ( inc_x == 1 ) + { + n1 = n >> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } m3 = m & 3 ; m1 = m & -4 ; @@ -258,32 +267,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[4] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[5] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[6] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[7] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - } - - if ( n2 & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -301,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; } - for( i = 0; i < ( n2 & 3) ; i++) + for( i = 0; i < n2 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; From 6df7a8893078e2f9878efeb7212fb7030185cf37 Mon Sep 17 00:00:00 
2001 From: wernsaar Date: Fri, 5 Sep 2014 10:22:50 +0200 Subject: [PATCH 18/44] optimized sgemv_t for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_t_4.c | 2 + kernel/x86_64/sgemv_t_microk_sandy-4.c | 174 +++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_t_microk_sandy-4.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b654d3564..b70486436 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,5 +1,5 @@ SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index fb0ba9741..b89ec7f7f 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_nehalem-4.c" #elif defined(BULLDOZER) #include "sgemv_t_microk_bulldozer-4.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_t_microk_sandy-4.c" #endif #define NBMAX 4096 diff --git a/kernel/x86_64/sgemv_t_microk_sandy-4.c b/kernel/x86_64/sgemv_t_microk_sandy-4.c new file mode 100644 index 000000000..6550518f7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_sandy-4.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm12, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm12, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm12, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm9 , %%xmm6 \n\t" + "subq $4 , %1 \n\t" + "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm9 , %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t" + "addq $16, %0 \n\t" + "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t" + "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t" + "subq $16, %1 \n\t" + "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm2, %%ymm6 \n\t" + "vaddps %%ymm7, %%ymm3, %%ymm7 \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + 
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 2021d0f9d6e155997450ab199d8af7e0a3a8551a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 5 Sep 2014 15:05:53 +0200 Subject: [PATCH 19/44] experimentally removed expensive function calls --- common_x86_64.h | 9 +++++++++ driver/others/parameter.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 0f842ee94..ae9b88718 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -46,6 +46,7 @@ #define __volatile__ #endif +/* #ifdef HAVE_SSE2 #define MB __asm__ __volatile__ ("mfence"); #define WMB __asm__ __volatile__ ("sfence"); @@ -53,6 +54,10 @@ #define MB #define WMB #endif +*/ + +#define MB +#define WMB static void __inline blas_lock(volatile BLASULONG *address){ @@ -99,6 +104,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ : "0" (op)); } +/* + #define WHEREAMI static inline int WhereAmI(void){ @@ -111,6 +118,8 @@ static inline int WhereAmI(void){ return apicid; } +*/ + #ifdef CORE_BARCELONA #define IFLUSH gotoblas_iflush() #define IFLUSH_HALF gotoblas_iflush_half() diff --git a/driver/others/parameter.c b/driver/others/parameter.c index a0a8b5188..c6c7301e8 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -251,7 +251,9 @@ void blas_set_parameter(void){ env_var_t p; int factor; +#if !defined(BULLDOZER) int size = get_L2_size(); +#endif #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) size >>= 7; From a64fe9bcc95b5378d47c424f615da95d38a9ec43 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 08:41:53 +0200 Subject: [PATCH 20/44] added optimized sgemv_n kernel for sandybridge --- driver/others/parameter.c | 4 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_n_4.c | 2 + kernel/x86_64/sgemv_n_microk_sandy-4.c | 322 +++++++++++++++++++++++++ 4 files changed, 328 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_sandy-4.c diff --git a/driver/others/parameter.c b/driver/others/parameter.c index c6c7301e8..f0f889a15 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -251,7 +251,9 @@ void blas_set_parameter(void){ env_var_t p; int factor; -#if !defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) + int size = 16; +#else int size = get_L2_size(); #endif diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b70486436..dfc2882aa 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 31d841ddd..617b1788f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_n_microk_bulldozer-4.c" #elif defined(NEHALEM) #include "sgemv_n_microk_nehalem-4.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_n_microk_sandy-4.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c new file mode 100644 index 000000000..b4caca630 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -0,0 +1,322 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4, %8 \n\t" + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8, %8 \n\t" + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + 
"prefetcht0 192(%4,%8,4) \n\t" + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%8,4) \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %8 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 
192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 77942374759502f740ccd7ec9130e7e790494d3a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 11:01:42 +0200 Subject: [PATCH 21/44] undef WHEREAMI --- common_x86_64.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index ae9b88718..547614f74 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -105,8 +105,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ } /* - #define WHEREAMI +*/ static inline int WhereAmI(void){ int eax, ebx, ecx, edx; @@ -118,7 +118,6 @@ static inline int WhereAmI(void){ return apicid; } -*/ #ifdef CORE_BARCELONA #define IFLUSH gotoblas_iflush() From d143f84dd26219e4a8d62e545a5449d47fe80583 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 12:08:48 +0200 Subject: [PATCH 22/44] added optimized sgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/sgemv_n_4.c | 2 + kernel/x86_64/sgemv_n_microk_haswell-4.c | 271 +++++++++++++++++++++++ 3 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_n_microk_haswell-4.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d0ac9c72f..c2c64939b 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 617b1788f..943dcdefa 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" +#elif defined(HASWELL) +#include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..ed1792245 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -0,0 +1,271 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %8 \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmadd231ps 
(%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "prefetcht0 192(%4,%8,4) \n\t" + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "prefetcht0 192(%6,%8,4) \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %8 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" 
// 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From cf5544b41750fae89ec2c3e83f6ed70ca2d508dc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 13:17:56 +0200 Subject: [PATCH 23/44] optimization for small size --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index ed1792245..a2470a4b7 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -105,41 +105,42 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" + // "prefetcht0 192(%3,%0,4) \n\t" "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - "prefetcht0 192(%4,%0,4) \n\t" + // "prefetcht0 192(%4,%0,4) \n\t" "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" + // "prefetcht0 192(%5,%0,4) \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" + // "prefetcht0 192(%6,%0,4) \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" + // "prefetcht0 192(%7,%0,4) \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - "prefetcht0 192(%4,%8,4) \n\t" + // "prefetcht0 192(%4,%8,4) \n\t" "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "addq $16, %0 \n\t" "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" + // "prefetcht0 192(%5,%8,4) \n\t" "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" + // "prefetcht0 192(%6,%8,4) \n\t" "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" + // "prefetcht0 192(%7,%8,4) \n\t" "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y - "addq $16, %8 \n\t" - "addq $16, %0 \n\t" + "vmovups %%ymm4,-64(%3,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" + "vmovups %%ymm5,-32(%3,%0,4) \n\t" // 8 * y + "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" From 3a7ab47ee95a34d113e68003a37c81eb70d74a6b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 18:34:25 +0200 Subject: [PATCH 24/44] optimized sgemv_t --- kernel/x86_64/sgemv_t_4.c | 69 ++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index b89ec7f7f..e4476080a 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -80,9 +80,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT ( "xorps %%xmm10 , %%xmm10 \n\t" "xorps %%xmm11 , %%xmm11 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" + + "testq $4 , %1 \n\t" + "jz .L01LABEL%= \n\t" "movups (%5,%0,4) , %%xmm14 
\n\t" // x "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 @@ -94,8 +94,36 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "subq $4 , %1 \n\t" "addps %%xmm13 , %%xmm11 \n\t" + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,4) , %%xmm14 \n\t" // x + "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "movups 16(%5,%0,4) , %%xmm14 \n\t" // x + "movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" "jnz .L01LOOP%= \n\t" + ".L01END%=: \n\t" + "haddps %%xmm10, %%xmm10 \n\t" "haddps %%xmm11, %%xmm11 \n\t" "haddps %%xmm10, %%xmm10 \n\t" @@ -113,7 +141,8 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "r" (ap1), // 4 "r" (x) // 5 : "cc", - "%xmm10", "%xmm11", "%xmm12", + "%xmm4", "%xmm5", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -130,10 +159,11 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __asm__ __volatile__ ( + "xorps %%xmm9 , %%xmm9 \n\t" "xorps %%xmm10 , %%xmm10 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" + + "testq $4 , %1 \n\t" + "jz .L01LABEL%= \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups (%4,%0,4) , %%xmm11 \n\t" @@ -142,8 +172,30 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addps %%xmm12 , %%xmm10 \n\t" "subq $4 , %1 \n\t" + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,4) , %%xmm12 \n\t" + "movups 16(%3,%0,4) , %%xmm14 \n\t" + "movups (%4,%0,4) , %%xmm11 \n\t" + "movups 16(%4,%0,4) , %%xmm13 \n\t" + "mulps %%xmm11 , %%xmm12 \n\t" + "mulps %%xmm13 , %%xmm14 \n\t" + "addq $8 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $8 , %1 \n\t" + "addps %%xmm14 , %%xmm9 \n\t" + "jnz .L01LOOP%= \n\t" + ".L01END%=: \n\t" + + "addps %%xmm9 , %%xmm10 \n\t" "haddps %%xmm10, %%xmm10 \n\t" "haddps %%xmm10, %%xmm10 \n\t" @@ -157,7 +209,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "r" (ap), // 3 "r" (x) // 4 : "cc", - "%xmm10", "%xmm11", "%xmm12", + "%xmm9", "%xmm10" , + "%xmm11", "%xmm12", "%xmm13", "%xmm14", "memory" ); From c8eaf3ae2d19a60039f55e5579c44329ff2d3000 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 19:41:57 +0200 Subject: [PATCH 25/44] optimized sgemv_t_4 kernel for very small sizes --- kernel/x86_64/sgemv_t_4.c | 98 +++++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index e4476080a..692dd536d 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -423,14 +423,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *aj = a_ptr; y_ptr = y; - for ( j=0; j Date: Sat, 6 Sep 2014 21:28:57 +0200 Subject: [PATCH 26/44] better optimzations for sgemv_t kernel --- kernel/x86_64/sgemv_t_4.c | 113 +++++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 692dd536d..920322c4f 100644 --- 
a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -446,12 +446,45 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { - for ( j=0; j Date: Sun, 7 Sep 2014 13:45:03 +0200 Subject: [PATCH 27/44] optimizations for very small sizes --- kernel/x86_64/sgemv_n_4.c | 218 +++++++++++++++++---- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 65 +++--- 2 files changed, 216 insertions(+), 67 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 943dcdefa..ee762ffce 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -44,12 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef HAVE_KERNEL_4x8 -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; FLOAT *b0,*b1,*b2,*b3; FLOAT *x4; + FLOAT x[8]; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; @@ -60,6 +61,9 @@ static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLON b3 = a3 + lda4 ; x4 = x + 4; + for ( i=0; i<8; i++) + x[i] = xo[i] * *alpha; + for ( i=0; i< n; i+=4 ) { @@ -81,15 +85,19 @@ static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLON #ifndef HAVE_KERNEL_4x4 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4]; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; @@ -101,32 +109,147 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +#ifndef HAVE_KERNEL_4x2 + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movss (%2) , %%xmm12 \n\t" // x0 + "movss (%6) , %%xmm4 \n\t" // alpha + "movss 4(%2) , %%xmm13 \n\t" // x1 + "mulss %%xmm4 , %%xmm12 \n\t" // alpha + "mulss %%xmm4 , %%xmm13 \n\t" // alpha + "shufps $0, %%xmm12, %%xmm12 \n\t" + "shufps $0, %%xmm13, %%xmm13 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "movups (%4,%0,4), %%xmm8 \n\t" + "movups (%5,%0,4), %%xmm9 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm9 , %%xmm4 \n\t" + + "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y + + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void 
sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "movss (%2), %%xmm12 \n\t" // x0 + "mulss (%6), %%xmm12 \n\t" // alpha + "shufps $0, %%xmm12, %%xmm12 \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a + "movups 16(%4,%0,4), %%xmm9 \n\t" // 4 * a + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm12, %%xmm9 \n\t" + "addps %%xmm4 , %%xmm8 \n\t" + "addps %%xmm5 , %%xmm9 \n\t" + + "addq $8 , %0 \n\t" + "movups %%xmm8 , -32(%3,%0,4) \n\t" // 4 * y + "movups %%xmm9 , -16(%3,%0,4) \n\t" // 4 * y + + "subq $8 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "testq $0x04, %5 \n\t" + "jz .L08LABEL%= \n\t" + + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "movups %%xmm4 , (%3,%0,4) \n\t" // 4 * y + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (n2), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + } - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) __attribute__ ((noinline)); -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) +#endif + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; if ( inc_dest != 1 ) { - FLOAT da = *alpha; for ( i=0; i Date: Sun, 7 Sep 2014 18:23:48 +0200 Subject: [PATCH 28/44] optimized sgemv_n for very small size of m --- kernel/x86_64/sgemv_n_4.c | 148 +++++++++++++++++++++++++++++++++++--- 1 file changed, 138 insertions(+), 10 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index ee762ffce..0135306af 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -438,25 +438,153 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } - if ( m3 == 0 ) return; + if ( m3 == 0 ) return(0); - j=0; - while ( j < m3 ) + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + 
return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) { a_ptr = a; x_ptr = x; FLOAT temp = 0.0; - for( i = 0; i < n; i++ ) + if ( lda == 1 && inc_x ==1 ) { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } y_ptr[0] += alpha * temp; - y_ptr += inc_y; - a++; - j++; + return(0); } + + return(0); } From 7b3932b3f348e88cfd9462463bb5ac1f6a5d3a8e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 19:20:08 +0200 Subject: [PATCH 29/44] optimized sgemv_n kernel for nehalem --- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 42 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index f87cfa425..77a1b11aa 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
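/* What the alpha-aware signatures below compute, as a portable scalar
   sketch (illustrative, not part of the patch; assumes ap[0..3] address
   four columns and n is a multiple of 4, as in the asm):

       for (i = 0; i < n; i++)
           y[i] += alpha * ( ap0[i]*x[0] + ap1[i]*x[1]
                           + ap2[i]*x[2] + ap3[i]*x[3] );

   The 4x8 variant does the same with eight broadcast x values, the second
   four taken lda4 elements further on.  Passing alpha into the kernel lets
   the SSE code broadcast it once (movss + shufps into xmm6) and fold it in
   with a single extra mulps per 4-wide block, so y no longer has to be
   rescaled in a separate add_y pass afterwards. */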
#define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -55,11 +55,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "shufps $0, %%xmm2 , %%xmm2 \n\t" "shufps $0, %%xmm3 , %%xmm3 \n\t" + "movss (%9), %%xmm6 \n\t" // alpha + "shufps $0, %%xmm6 , %%xmm6 \n\t" + ".align 16 \n\t" ".L01LOOP%=: \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y ".align 2 \n\t" "movups (%4,%0,4), %%xmm8 \n\t" @@ -85,16 +89,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "mulps %%xmm1 , %%xmm9 \n\t" "mulps %%xmm2 , %%xmm10 \n\t" "mulps %%xmm3 , %%xmm11 \n\t" - "addq $4 , %8 \n\t" "addps %%xmm8 , %%xmm4 \n\t" "addps %%xmm9 , %%xmm5 \n\t" - "addq $4 , %0 \n\t" "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "subq $4 , %1 \n\t" - "addps %%xmm4 , %%xmm5 \n\t" - "movups %%xmm5 , -16(%3,%0,4) \n\t" // 4 * y + "addq $4 , %8 \n\t" + "addps %%xmm5 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "mulps %%xmm6 , %%xmm4 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y "jnz .L01LOOP%= \n\t" @@ -108,11 +115,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 + "r" (lda4), // 8 + "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -124,9 +133,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO #define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; @@ -142,9 +151,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" + "movss (%8), %%xmm6 \n\t" // alpha + "shufps $0, %%xmm6 , %%xmm6 \n\t" + ".align 16 \n\t" ".L01LOOP%=: \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "xorps %%xmm4 , %%xmm4 \n\t" + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm9 \n\t" @@ -161,6 +174,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addps %%xmm10 , %%xmm4 \n\t" "addps %%xmm4 , %%xmm11 \n\t" + "mulps %%xmm6 , %%xmm11 \n\t" + "addps %%xmm7 , %%xmm11 \n\t" "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y "jnz .L01LOOP%= \n\t" @@ -174,7 +189,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (ap[3]), // 7 + "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", "%xmm6", 
"%xmm7", From 553e2754077eb0b2cb8782e8ecdb5e6eb9c8366b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 20:53:30 +0200 Subject: [PATCH 30/44] optimized sgemv_n kernel for sandybridge --- kernel/x86_64/sgemv_n_microk_sandy-4.c | 112 +++++++++++++++++-------- 1 file changed, 79 insertions(+), 33 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index b4caca630..44c2b3f2b 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -29,9 +29,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -48,61 +48,75 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4, %8 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" - ".L08LABEL%=: \n\t" + ".L08LABEL%=: \n\t" "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + 
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8, %8 \n\t" "addq $8, %0 \n\t" @@ -117,8 +131,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" @@ -164,6 +178,12 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + + "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y @@ -185,11 +205,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 + "r" (lda4), // 8 + "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -201,9 +223,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO #define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; @@ -216,21 +238,29 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4, %0 \n\t" "subq $4, %1 \n\t" @@ -240,18 +270,24 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 
8 * y "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8, %0 \n\t" "subq $8, %1 \n\t" @@ -265,8 +301,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y "prefetcht0 192(%4,%0,4) \n\t" "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" @@ -290,8 +328,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + + "vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t" + "vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t" + + "vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y "addq $16, %0 \n\t" "subq $16, %1 \n\t" @@ -309,8 +353,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (ap[3]), // 7 + "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", From 80f77868758a6e99c2716dd12d8bfb63d6ed015f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 21:13:57 +0200 Subject: [PATCH 31/44] enabled optimized sgemv kernels for piledriver --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/KERNEL.PILEDRIVER | 4 ++-- kernel/x86_64/sgemv_t_4.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index c2c64939b..d0ac9c72f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n_4.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 146a8768b..4f15e5a36 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,5 +1,5 @@ -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 920322c4f..3316473af 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
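/* Background for the one-line dispatch change below (illustrative sketch,
   not part of the patch): the Bulldozer micro-kernel is built on the
   4-operand FMA4 instruction vfmaddps, e.g.

       vfmaddps %xmm4, (%r8,%rax,4), %xmm12, %xmm4   // xmm4 += a[i..i+3] * x0

   (registers here are illustrative only).  Piledriver implements FMA4 as
   well, so the identical file can be reused for it; only the #elif
   condition has to be widened. */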
#if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" From 2be5c7a640488796d98ac3cbb44004a39491da7f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 21:48:42 +0200 Subject: [PATCH 32/44] bugfix for windows --- kernel/x86_64/sgemv_n_microk_sandy-4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index 44c2b3f2b..c162eeeb6 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -357,7 +357,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", + "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" From cbbc80aad3586900443ed7fef1d0ff1814a80e9a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 10:13:39 +0200 Subject: [PATCH 33/44] added optimized sgemv_t kernel for haswell --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 275 ++++++++++++----------- kernel/x86_64/sgemv_t_microk_haswell-4.c | 148 ++++++++++++ 2 files changed, 298 insertions(+), 125 deletions(-) create mode 100644 kernel/x86_64/sgemv_t_microk_haswell-4.c diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index a2470a4b7..1e4498d9e 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -47,10 +47,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" @@ -64,6 +67,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y @@ -76,7 +81,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" @@ -90,6 +96,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , 
%%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y @@ -105,42 +114,160 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - // "prefetcht0 192(%3,%0,4) \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - // "prefetcht0 192(%4,%0,4) \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - // "prefetcht0 192(%5,%0,4) \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - // "prefetcht0 192(%6,%0,4) \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - // "prefetcht0 192(%7,%0,4) \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - // "prefetcht0 192(%4,%8,4) \n\t" "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" - "addq $16, %0 \n\t" + "addq $16, %0 \n\t" "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - // "prefetcht0 192(%5,%8,4) \n\t" "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - // "prefetcht0 192(%6,%8,4) \n\t" "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - // "prefetcht0 192(%7,%8,4) \n\t" "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" - "addq $16, %8 \n\t" - "vmovups %%ymm4,-64(%3,%0,4) \n\t" // 8 * y - "subq $16, %1 \n\t" - "vmovups %%ymm5,-32(%3,%0,4) \n\t" // 8 * y + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + "addq $16, %8 \n\t" + "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y + "subq $16, %1 \n\t" + "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + 
"addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" @@ -156,113 +283,11 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -#define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); - -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - - "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" - - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y - - "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm4 \n\t" - - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - ".L08LABEL%=: \n\t" - - "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" - - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - - ".L16LABEL%=: \n\t" - - "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" - - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - - "prefetcht0 192(%4,%0,4) \n\t" - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" 
- "prefetcht0 192(%6,%0,4) \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y - - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" - - ".L16END%=: \n\t" - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c new file mode 100644 index 000000000..016cb35e7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 7c0a94ff472bcb01d666ca0bd6975c0b24267680 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 10:54:33 +0200 Subject: [PATCH 34/44] bugfix in sgemv_n_microk_haswell-4.c --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 1e4498d9e..8f56655a9 
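The reduction tail of the sgemv_t kernel above (.L16END) folds each 8-lane accumulator to a single dot product: vextractf128 pulls the high 128 bits so they can be added onto the low half, then two vhaddps steps collapse the remaining four lanes, and vmovss stores one scalar per column into y[0..3]. An equivalent sketch in AVX intrinsics, offered for reference only; the helper name is illustrative:

#include <immintrin.h>

/* Fold one ymm accumulator to the scalar sum of its eight float lanes,
   mirroring the vextractf128 + 2x vhaddps sequence in the kernel above. */
static float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);    /* lanes 0..3                 */
    __m128 hi = _mm256_extractf128_ps(v, 1);  /* lanes 4..7 (vextractf128)  */
    __m128 s  = _mm_add_ps(lo, hi);           /* 8 lanes -> 4 partial sums  */
    s = _mm_hadd_ps(s, s);                    /* 4 -> 2 (first vhaddps)     */
    s = _mm_hadd_ps(s, s);                    /* 2 -> 1 (second vhaddps)    */
    return _mm_cvtss_f32(s);                  /* scalar result (vmovss)     */
}

Note that alpha is not applied inside this micro-kernel (its signature takes no alpha); the sgemv_t driver scales the four partial results when merging them into y.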
100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -248,6 +248,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT ".align 16 \n\t" ".L01LOOP%=: \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y From c4d9d4e5f8319a743df17564c4bf1a1a0c3670e2 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 12:25:16 +0200 Subject: [PATCH 35/44] added haswell optimized kernel --- kernel/x86_64/sgemv_t_4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 3316473af..b0e883252 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,6 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" +#elif defined(HASWELL) +#include "sgemv_t_microk_haswell-4.c" #endif #define NBMAX 4096 From f511807fc07e4e62f07b4a880d3196b860796bec Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 12:27:32 +0200 Subject: [PATCH 36/44] modified multithreading threshold --- interface/gemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 64dc641d0..2dd82dce5 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_avail = nthreads_max; double MNK = (double) m * (double) n; - if ( MNK <= (96.0 * 24.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From 658939faaada12ab40334f986a665d28eef2ef19 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 15:22:35 +0200 Subject: [PATCH 37/44] optimized dgemv_n kernel for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/dgemv_n_4.c | 546 +++++++++++++++++++++++ kernel/x86_64/dgemv_n_microk_nehalem-4.c | 265 +++++++++++ 3 files changed, 812 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemv_n_4.c create mode 100644 kernel/x86_64/dgemv_n_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 68c741cea..8feef5c31 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -11,7 +11,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_4.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c new file mode 100644 index 000000000..249df8009 --- /dev/null +++ b/kernel/x86_64/dgemv_n_4.c @@ -0,0 +1,546 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(NEHALEM) +#include "dgemv_n_microk_nehalem-4.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x8 + +static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT *b0,*b1,*b2,*b3; + FLOAT *x4; + FLOAT x[8]; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x4 = x + 4; + + for ( i=0; i<8; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + + y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; + y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; + y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; + y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; + + } +} + +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4]; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2) , %%xmm12 \n\t" // x0 + "movsd (%6) , %%xmm4 \n\t" // alpha + "movsd 8(%2) , %%xmm13 \n\t" // x1 + "mulsd %%xmm4 , %%xmm12 \n\t" // alpha + "mulsd %%xmm4 , %%xmm13 \n\t" // alpha + "shufpd $0, %%xmm12, %%xmm12 \n\t" + "shufpd $0, %%xmm13, %%xmm13 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + + "movups (%4,%0,8), %%xmm8 \n\t" + "movups 
(%5,%0,8), %%xmm9 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm5 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + + "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y + "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "mulsd (%5), %%xmm12 \n\t" // alpha + "shufpd $0, %%xmm12, %%xmm12 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a + "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm12, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + + "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y + "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + 
ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c new file mode 100644 index 000000000..e311326f1 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c @@ -0,0 +1,265 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "movsd 8(%2), %%xmm13 \n\t" // x1 + "movsd 16(%2), %%xmm14 \n\t" // x2 + "movsd 24(%2), %%xmm15 \n\t" // x3 + "shufpd $0, %%xmm12, %%xmm12\n\t" + "shufpd $0, %%xmm13, %%xmm13\n\t" + "shufpd $0, %%xmm14, %%xmm14\n\t" + "shufpd $0, %%xmm15, %%xmm15\n\t" + + "movsd 32(%2), %%xmm0 \n\t" // x4 + "movsd 40(%2), %%xmm1 \n\t" // x5 + "movsd 48(%2), %%xmm2 \n\t" // x6 + "movsd 56(%2), %%xmm3 \n\t" // x7 + "shufpd $0, %%xmm0 , %%xmm0 \n\t" + "shufpd $0, %%xmm1 , %%xmm1 \n\t" + "shufpd $0, %%xmm2 , %%xmm2 \n\t" + "shufpd $0, %%xmm3 , %%xmm3 \n\t" + + "movsd (%9), %%xmm6 \n\t" // alpha + "shufpd $0, %%xmm6 , %%xmm6 \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + + ".align 2 \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "movups (%4,%8,8), %%xmm8 \n\t" + "movups (%5,%8,8), %%xmm9 \n\t" + "movups (%6,%8,8), %%xmm10 \n\t" + "movups (%7,%8,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm0 , %%xmm8 \n\t" + "mulpd %%xmm1 , %%xmm9 \n\t" + "mulpd %%xmm2 , %%xmm10 \n\t" + "mulpd %%xmm3 , %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "addpd %%xmm5 , %%xmm4 \n\t" + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + + ".align 2 \n\t" + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 
\n\t" + "movups 16(%6,%0,8), %%xmm10 \n\t" + "movups 16(%7,%0,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "movups 16(%4,%8,8), %%xmm8 \n\t" + "movups 16(%5,%8,8), %%xmm9 \n\t" + "movups 16(%6,%8,8), %%xmm10 \n\t" + "movups 16(%7,%8,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm0 , %%xmm8 \n\t" + "mulpd %%xmm1 , %%xmm9 \n\t" + "mulpd %%xmm2 , %%xmm10 \n\t" + "mulpd %%xmm3 , %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "addq $4 , %8 \n\t" + "addpd %%xmm5 , %%xmm4 \n\t" + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "movsd 8(%2), %%xmm13 \n\t" // x1 + "movsd 16(%2), %%xmm14 \n\t" // x2 + "movsd 24(%2), %%xmm15 \n\t" // x3 + "shufpd $0, %%xmm12, %%xmm12\n\t" + "shufpd $0, %%xmm13, %%xmm13\n\t" + "shufpd $0, %%xmm14, %%xmm14\n\t" + "shufpd $0, %%xmm15, %%xmm15\n\t" + + "movsd (%8), %%xmm6 \n\t" // alpha + "shufpd $0, %%xmm6 , %%xmm6 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm11 \n\t" + + "mulpd %%xmm6 , %%xmm11 \n\t" + "addpd %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "movups 16(%6,%0,8), %%xmm10 \n\t" + "movups 16(%7,%0,8), %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm11 \n\t" + + "mulpd %%xmm6 , %%xmm11 \n\t" + "addpd %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" 
(alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From cd34e9701b552bff542c335943aec70e159037ba Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 19:15:31 +0200 Subject: [PATCH 38/44] removed obsolete files --- kernel/x86_64/sgemv_n_avx.c | 218 ----------- kernel/x86_64/sgemv_n_microk_bulldozer.c | 451 --------------------- kernel/x86_64/sgemv_n_microk_haswell.c | 461 ---------------------- kernel/x86_64/sgemv_n_microk_sandy.c | 473 ----------------------- kernel/x86_64/sgemv_t_avx.c | 232 ----------- kernel/x86_64/sgemv_t_microk_bulldozer.c | 99 ----- kernel/x86_64/sgemv_t_microk_haswell.c | 100 ----- kernel/x86_64/sgemv_t_microk_sandy.c | 106 ----- 8 files changed, 2140 deletions(-) delete mode 100644 kernel/x86_64/sgemv_n_avx.c delete mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer.c delete mode 100644 kernel/x86_64/sgemv_n_microk_haswell.c delete mode 100644 kernel/x86_64/sgemv_n_microk_sandy.c delete mode 100644 kernel/x86_64/sgemv_t_avx.c delete mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer.c delete mode 100644 kernel/x86_64/sgemv_t_microk_haswell.c delete mode 100644 kernel/x86_64/sgemv_t_microk_sandy.c diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c deleted file mode 100644 index 57aaad4b4..000000000 --- a/kernel/x86_64/sgemv_n_avx.c +++ /dev/null @@ -1,218 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - - -#include "common.h" - -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_n_microk_bulldozer.c" -#elif defined(HASWELL) -#include "sgemv_n_microk_haswell.c" -#else -#include "sgemv_n_microk_sandy.c" -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for ( i=0; i 0 ) - { - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(n2,x_ptr,xbuffer,inc_x); - - a_ptr = a + n1 * 512 * lda; - y_ptr = y; - - for(i = 0; i rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" 
(n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero - "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero - "vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero - "vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp - "vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha - "vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha - "vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha - "vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - "vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha - "vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha - - "vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - ); - -} - -static void sgemv_kernel_16( long n, float 
alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - - "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - - "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), 
%%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_n_microk_haswell.c b/kernel/x86_64/sgemv_n_microk_haswell.c deleted file mode 100644 index 9db3869d2..000000000 --- a/kernel/x86_64/sgemv_n_microk_haswell.c +++ /dev/null @@ -1,461 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*2; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp - "vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp - "vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp - "vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi 
\n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - 
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", 
"%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vmulss 0*4(%%rsi), %%xmm0, 
%%xmm4 \n\t" // multiply a and c and add to temp - "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_n_microk_sandy.c b/kernel/x86_64/sgemv_n_microk_sandy.c deleted file mode 100644 index 9bdb06600..000000000 --- a/kernel/x86_64/sgemv_n_microk_sandy.c +++ /dev/null @@ -1,473 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*2; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp - "vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp - "vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp - "vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups 
%%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq 
%5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - 
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_t_avx.c b/kernel/x86_64/sgemv_t_avx.c deleted file mode 100644 index 55fb3d623..000000000 --- a/kernel/x86_64/sgemv_t_avx.c +++ /dev/null @@ -1,232 +0,0 @@ 
-/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" - -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_t_microk_bulldozer.c" -#elif defined(HASWELL) -#include "sgemv_t_microk_haswell.c" -#else -#include "sgemv_t_microk_sandy.c" -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for ( i=0; i= 16 ) - { - if ( m2 & Mblock) - { - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(Mblock,x_ptr,xbuffer,inc_x); - - y_ptr = y; - a_ptrl = a_ptr; - - for(i = 0; i rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , 
%%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/kernel/x86_64/sgemv_t_microk_haswell.c b/kernel/x86_64/sgemv_t_microk_haswell.c deleted file mode 100644 index ecb9845bb..000000000 --- a/kernel/x86_64/sgemv_t_microk_haswell.c +++ /dev/null @@ -1,100 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - //n = n / 16; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp - "vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp - "vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp - "vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" - "vaddss (%%rdx), %%xmm12,%%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/kernel/x86_64/sgemv_t_microk_sandy.c b/kernel/x86_64/sgemv_t_microk_sandy.c deleted file mode 100644 index 4ecd6d3d0..000000000 --- a/kernel/x86_64/sgemv_t_microk_sandy.c +++ /dev/null @@ -1,106 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - //n = n / 16; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp - "vmulps 4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp - "vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm8 , %%xmm12\n\t" - "vaddps %%xmm13, %%xmm9 , %%xmm13\n\t" - "vaddps %%xmm14, %%xmm10, %%xmm14\n\t" - "vaddps %%xmm15, %%xmm11, %%xmm15\n\t" - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t" - "vaddss (%%rdx), %%xmm12, %%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - From 44f2bf9bae7b25356fc0179d6b935de4edadc637 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 13:34:22 +0200 Subject: [PATCH 39/44] added optimized dgemv_t kernel for haswell --- kernel/x86_64/dgemv_t_4.c | 623 +++++++++++++++++++++++ 
 kernel/x86_64/dgemv_t_microk_haswell-4.c | 127 +++++ 2 files changed, 750 insertions(+) create mode 100644 kernel/x86_64/dgemv_t_4.c create mode 100644 kernel/x86_64/dgemv_t_microk_haswell-4.c diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c new file mode 100644 index 000000000..0d0409bec --- /dev/null +++ b/kernel/x86_64/dgemv_t_4.c @@ -0,0 +1,623 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(NEHALEM) +#include "dgemv_t_microk_nehalem-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) +#include "dgemv_t_microk_bulldozer-4.c" +#elif defined(SANDYBRIDGE) +#include "dgemv_t_microk_sandy-4.c" +#elif defined(HASWELL) +#include "dgemv_t_microk_haswell-4.c" +#endif +*/ + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorpd %%xmm10 , %%xmm10 \n\t" + "xorpd %%xmm11 , %%xmm11 \n\t" + + "testq $2 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%5,%0,8) , %%xmm14 \n\t" // x + "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addq $2 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $2 , %1 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,8) , %%xmm14 \n\t" // x + "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + "movups 16(%5,%0,8) , %%xmm14 \n\t" // x + "movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "haddpd %%xmm10, %%xmm10 \n\t" + "haddpd %%xmm11, %%xmm11 \n\t" + + "movsd %%xmm10, (%2) \n\t" + "movsd %%xmm11,8(%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 + "r" (x) // 5 + : "cc", + "%xmm4", "%xmm5", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorpd %%xmm9 , %%xmm9 \n\t" + "xorpd %%xmm10 , %%xmm10 \n\t" + + "testq $2 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%3,%0,8) , %%xmm12 \n\t" + "movups (%4,%0,8) , %%xmm11 \n\t" + "mulpd %%xmm11 , %%xmm12 \n\t" + "addq $2 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $2 , %1 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,8) , %%xmm12 
\n\t" + "movups 16(%3,%0,8) , %%xmm14 \n\t" + "movups (%4,%0,8) , %%xmm11 \n\t" + "movups 16(%4,%0,8) , %%xmm13 \n\t" + "mulpd %%xmm11 , %%xmm12 \n\t" + "mulpd %%xmm13 , %%xmm14 \n\t" + "addq $4 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + "addpd %%xmm14 , %%xmm9 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "addpd %%xmm9 , %%xmm10 \n\t" + "haddpd %%xmm10, %%xmm10 \n\t" + + "movsd %%xmm10, (%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 + : "cc", + "%xmm9", "%xmm10" , + "%xmm11", "%xmm12", "%xmm13", "%xmm14", + "memory" + ); + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j Date: Tue, 9 Sep 2014 13:54:55 +0200 Subject: [PATCH 40/44] added optimized gemv kernels --- kernel/x86_64/KERNEL.HASWELL | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d0ac9c72f..8aab560c4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,8 +1,8 @@ -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c -DGEMVTKERNEL = dgemv_t.c +DGEMVNKERNEL = dgemv_n_4.c +DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c ZGEMVTKERNEL = zgemv_t.c From debc6d1a056f7d2763dfb4de01cf1b4780a2536b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 14:04:44 +0200 Subject: [PATCH 41/44] bugfix in KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 8aab560c4..9a5c54ffc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,7 +1,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c 
-DGEMVNKERNEL = dgemv_n_4.c +DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c From 8109d8232c66c4119044fa3111947d88afae9eb6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 14:38:08 +0200 Subject: [PATCH 42/44] optimized dgemv_t kernel for haswell --- kernel/x86_64/dgemv_t_4.c | 10 +--------- kernel/x86_64/dgemv_t_microk_haswell-4.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 0d0409bec..ebec7d2c3 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,17 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -/* -#if defined(NEHALEM) -#include "dgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) -#include "dgemv_t_microk_bulldozer-4.c" -#elif defined(SANDYBRIDGE) -#include "dgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) +#if defined(HASWELL) #include "dgemv_t_microk_haswell-4.c" #endif -*/ #define NBMAX 2048 diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c index 410225500..33b43515d 100644 --- a/kernel/x86_64/dgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c @@ -61,25 +61,25 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 384(%2,%0,8) \n\t" + // "prefetcht0 384(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x - "prefetcht0 384(%4,%0,8) \n\t" + // "prefetcht0 384(%4,%0,8) \n\t" "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" - "prefetcht0 384(%5,%0,8) \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "prefetcht0 384(%6,%0,8) \n\t" + // "prefetcht0 384(%5,%0,8) \n\t" "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" - "prefetcht0 384(%7,%0,8) \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm6 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm13, %%ymm7 \n\t" + // "prefetcht0 384(%6,%0,8) \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "addq $8 , %0 \n\t" + // "prefetcht0 384(%7,%0,8) \n\t" + "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" From faab7a181d72023c11b098da9cabc49a2ae3701d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 15:32:32 +0200 Subject: [PATCH 43/44] added optimized dgemv_n kernel for haswell --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 247 +++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 kernel/x86_64/dgemv_n_microk_haswell-4.c diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..2c77f3469 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + + "vbroadcastsd (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L8LABEL%= \n\t" + + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L8LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + 
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "addq $8 , %8 \n\t" + "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y + "subq $8 , %1 \n\t" + "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + + "vbroadcastsd (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L8LABEL%= \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L8LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L8END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L8END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 
baa46e4fba439a4dea4eed9fe82d0cd164f77a5a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 16:17:45 +0200 Subject: [PATCH 44/44] added and tested optimized dgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/dgemv_n_4.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9a5c54ffc..8aab560c4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,7 +1,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 249df8009..371fd73ee 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" +#elif defined(HASWELL) +#include "dgemv_n_microk_haswell-4.c" #endif
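
For reference, the computation performed by the new dgemv_n micro-kernel's inline assembly can be sketched in portable AVX2/FMA intrinsics. This is an illustrative rendering only: the function name and signature below are not part of the patches, and n is assumed to be a multiple of 4 (the real kernels handle remainders separately).

#include <immintrin.h>

/* Sketch of the dgemv_n 4x4 step: y[0..n) += alpha * sum_j ap[j] * x[j].
 * Each x element is broadcast once, then whole 4-double column slices are
 * fused-multiply-added into an accumulator, as the ymm code does. */
static void dgemv_n_4x4_sketch(long n, double **ap, double *x, double *y, double alpha)
{
	__m256d x0 = _mm256_broadcast_sd(&x[0]);   /* like vbroadcastsd */
	__m256d x1 = _mm256_broadcast_sd(&x[1]);
	__m256d x2 = _mm256_broadcast_sd(&x[2]);
	__m256d x3 = _mm256_broadcast_sd(&x[3]);
	__m256d va = _mm256_broadcast_sd(&alpha);

	for (long i = 0; i < n; i += 4) {
		__m256d acc = _mm256_setzero_pd();                          /* vxorpd      */
		__m256d yv  = _mm256_loadu_pd(&y[i]);                       /* 4 * y       */
		acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[0][i]), x0, acc); /* vfmadd231pd */
		acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[1][i]), x1, acc);
		acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[2][i]), x2, acc);
		acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[3][i]), x3, acc);
		yv = _mm256_fmadd_pd(va, acc, yv);                          /* y += alpha*acc */
		_mm256_storeu_pd(&y[i], yv);
	}
}

The hand-written 4x8 variant additionally covers eight columns per pass and interleaves the pointer bookkeeping with the arithmetic; the sketch above only captures the data flow.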
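The transposed kernels in dgemv_t_microk_haswell-4.c work the other way around: each pass accumulates four dot products A(:,j)^T * x in vector registers and reduces them horizontally once at the end. A hedged intrinsics sketch of that pattern follows (again with an illustrative name, n a multiple of 4, and the scalar tail omitted).

#include <immintrin.h>

/* Sketch: y[j] = dot(ap[j], x) for j = 0..3, accumulated vector-wide and
 * summed horizontally at the end, mirroring the vfmadd231pd loop plus the
 * final reduction in the patched micro-kernel. */
static void dgemv_t_4x4_sketch(long n, double **ap, double *x, double *y)
{
	__m256d s0 = _mm256_setzero_pd(), s1 = _mm256_setzero_pd();
	__m256d s2 = _mm256_setzero_pd(), s3 = _mm256_setzero_pd();

	for (long i = 0; i < n; i += 4) {
		__m256d xv = _mm256_loadu_pd(&x[i]);                    /* 4 * x */
		s0 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[0][i]), xv, s0);
		s1 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[1][i]), xv, s1);
		s2 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[2][i]), xv, s2);
		s3 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[3][i]), xv, s3);
	}

	double t[4];                                                /* horizontal sums */
	_mm256_storeu_pd(t, s0); y[0] = t[0] + t[1] + t[2] + t[3];
	_mm256_storeu_pd(t, s1); y[1] = t[0] + t[1] + t[2] + t[3];
	_mm256_storeu_pd(t, s2); y[2] = t[0] + t[1] + t[2] + t[3];
	_mm256_storeu_pd(t, s3); y[3] = t[0] + t[1] + t[2] + t[3];
}

PATCH 42 is worth reading in this light: it comments out the prefetcht0 instructions and interleaves the addq/subq pointer updates between FMA pairs, presumably because Haswell's hardware prefetcher already covers these streaming accesses, so the explicit prefetches only cost load-port slots.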
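Both new transposed drivers (sgemv_t_4.c and dgemv_t_4.c) wrap these micro-kernels in an NBMAX-sized blocking loop so that the packed copy of x stays cache-resident while every column of the block is processed. Stripped of strides and tail handling, the scheme looks roughly like the illustrative code below; it is not taken from the patch.

#define NBMAX 2048   /* block length used by the double-precision driver */

/* Sketch: y += alpha * A^T * x with the dot-product length m processed in
 * NBMAX chunks; x is packed once per chunk and reused for all n columns. */
static void gemv_t_blocked_sketch(long m, long n, double alpha,
                                  const double *a, long lda,
                                  const double *x, double *y)
{
	double xbuf[NBMAX];

	for (long mb = 0; mb < m; mb += NBMAX) {
		long nb = (m - mb < NBMAX) ? (m - mb) : NBMAX;
		for (long i = 0; i < nb; i++)          /* pack x (copy_x in the patch) */
			xbuf[i] = x[mb + i];
		for (long j = 0; j < n; j++) {         /* micro-kernels do 4 columns at once */
			double t = 0.0;
			for (long i = 0; i < nb; i++)
				t += a[j * lda + mb + i] * xbuf[i];
			y[j] += alpha * t;                 /* add_y in the patch */
		}
	}
}

In the real driver the inner loop runs the 4x4 micro-kernel over four columns at a time, copy_x also absorbs a non-unit inc_x, and separate paths handle the m & 3 and n & 3 remainders.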
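Finally, since PATCH 44 is described as "added and tested", a minimal way to exercise such a kernel is to compare it against a naive reference on sizes that hit both the full-block and tail code paths. The harness below is a sketch under the assumption that the kernel under test is linked in and substituted for the second reference call; none of it comes from the patches.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Naive column-major y += alpha * A^T * x, used as the reference. */
static void dgemv_t_ref(long m, long n, double alpha, const double *a, long lda,
                        const double *x, double *y)
{
	for (long j = 0; j < n; j++) {
		double t = 0.0;
		for (long i = 0; i < m; i++)
			t += a[j * lda + i] * x[i];
		y[j] += alpha * t;
	}
}

int main(void)
{
	long m = 2051, n = 131, lda = m;   /* odd sizes exercise the remainder paths */
	double *a  = malloc(sizeof(double) * lda * n);
	double *x  = malloc(sizeof(double) * m);
	double *y0 = calloc(n, sizeof(double));
	double *y1 = calloc(n, sizeof(double));

	for (long i = 0; i < lda * n; i++) a[i] = (double)rand() / RAND_MAX;
	for (long i = 0; i < m; i++)       x[i] = (double)rand() / RAND_MAX;

	dgemv_t_ref(m, n, 1.1, a, lda, x, y0);
	dgemv_t_ref(m, n, 1.1, a, lda, x, y1);   /* replace with the kernel under test */

	double maxdiff = 0.0;
	for (long j = 0; j < n; j++)
		maxdiff = fmax(maxdiff, fabs(y0[j] - y1[j]));
	printf("max abs diff: %g\n", maxdiff);

	free(a); free(x); free(y0); free(y1);
	return (maxdiff < 1e-10) ? 0 : 1;
}

m = 2051 crosses the NBMAX = 2048 block boundary and leaves an m & 3 remainder of 3, so the blocked path, the partial block, and the scalar tail all run.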