Merge pull request #445 from wernsaar/develop

A lot of optimizations for gemv kernels

commit d13e92f07e
@@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){

  blasint inc_x=1,inc_y=1;
  blasint n=0;
  int has_param_n = 0;
  int has_param_m = 0;
  int loops = 1;
  int l;
  char *p;
@@ -145,29 +146,38 @@ int MAIN__(int argc, char *argv[]){

  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  int tomax = to;

  if ((p = getenv("OPENBLAS_LOOPS")))  loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))   inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY")))   inc_y = atoi(p);
  if ((p = getenv("OPENBLAS_TRANS")))  trans=*p;
  if ((p = getenv("OPENBLAS_PARAM_N"))) {
    n = atoi(p);
    if ((n>0) && (n<=to)) has_param_n = 1;
    if ((n>0)) has_param_n = 1;
    if ( n > tomax ) tomax = n;
  }
  if ( has_param_n == 0 )
  if ((p = getenv("OPENBLAS_PARAM_M"))) {
    m = atoi(p);
    if ((m>0)) has_param_m = 1;
    if ( m > tomax ) tomax = m;
  }

  if ( has_param_n == 1 )
    fprintf(stderr, "From : %3d  To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops);
  else
    fprintf(stderr, "From : %3d  To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);

  if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
  if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
@@ -177,22 +187,20 @@ int MAIN__(int argc, char *argv[]){

  fprintf(stderr, "   SIZE       Flops\n");

  for(m = from; m <= to; m += step)
  if (has_param_m == 0)
  {

    for(m = from; m <= to; m += step)
    {
      timeg=0;

      if ( has_param_n == 0 ) n = m;

      fprintf(stderr, " %6dx%d : ", (int)m,(int)n);

      for(j = 0; j < m; j++){
        for(i = 0; i < n * COMPSIZE; i++){
          a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
      }

      for (l=0; l<loops; l++)
      {
@@ -204,24 +212,56 @@ int MAIN__(int argc, char *argv[]){

          y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
        gettimeofday( &start, (struct timezone *)0);

        GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );

        gettimeofday( &stop, (struct timezone *)0);

        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

        timeg += time1;

      }

      timeg /= loops;

      fprintf(stderr,
        " %10.2f MFlops\n",
        COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
      fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);

    }
  }
  else
  {

    for(n = from; n <= to; n += step)
    {
      timeg=0;
      fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
      for(j = 0; j < m; j++){
        for(i = 0; i < n * COMPSIZE; i++){
          a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
      }

      for (l=0; l<loops; l++)
      {

        for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
          x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }

        for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
          y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        }
        gettimeofday( &start, (struct timezone *)0);
        GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
        gettimeofday( &stop, (struct timezone *)0);
        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
        timeg += time1;

      }

      timeg /= loops;

      fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);

    }
  }

  return 0;
}
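The essence of the new OPENBLAS_LOOPS handling is wall-clock timing averaged over repeated calls. A minimal standalone sketch of that pattern — work() here is a hypothetical stand-in for the timed GEMV call, not part of the benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/time.h>

	/* Hypothetical payload standing in for the timed GEMV call. */
	static double work(void)
	{
		double s = 0.0;
		for (int i = 0; i < 1000000; i++) s += (double)i * 0.5;
		return s;
	}

	int main(void)
	{
		struct timeval start, stop;
		char *p;
		int loops = 1;
		double timeg = 0.0;

		/* Same environment knob the benchmark reads. */
		if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);

		for (int l = 0; l < loops; l++) {
			gettimeofday(&start, (struct timezone *)0);
			work();
			gettimeofday(&stop, (struct timezone *)0);
			timeg += (double)(stop.tv_sec - start.tv_sec)
			       + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;
		}
		timeg /= loops;   /* averaged time, as in the patch */
		fprintf(stderr, "avg %f s over %d loop(s)\n", timeg, loops);
		return 0;
	}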
@@ -0,0 +1,42 @@
# **********************************************************************************
# Copyright (c) 2014, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
#    its contributors may be used to endorse or promote products
#    derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************

set term x11 font sans;
set ylabel "MFlops";
set xlabel "Size";
set grid xtics;
set grid ytics;
set key left;
set timestamp "generated on %Y-%m-%d by `whoami`"
set title "Sgemv\nTRANS=T\nBulldozer"
plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier;
set output "print.png";
show title;
show plot;
show output;
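A likely workflow, though the commit does not spell it out: run the GEMV benchmark once per thread count, save the size/MFlops columns of its stderr output as '1-THREAD', '2-THREADS' and '4-THREADS', then load this script in gnuplot to overlay the three smoothed curves.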
@@ -46,6 +46,7 @@
#define __volatile__
#endif

/*
#ifdef HAVE_SSE2
#define MB   __asm__ __volatile__ ("mfence");
#define WMB  __asm__ __volatile__ ("sfence");

@@ -53,6 +54,10 @@
#define MB
#define WMB
#endif
*/

#define MB
#define WMB

static void __inline blas_lock(volatile BLASULONG *address){

@@ -99,7 +104,9 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
  : "0" (op));
}

/*
#define WHEREAMI
*/

static inline int WhereAmI(void){
  int eax, ebx, ecx, edx;

@@ -111,6 +118,7 @@ static inline int WhereAmI(void){
  return apicid;
}

#ifdef CORE_BARCELONA
#define IFLUSH		gotoblas_iflush()
#define IFLUSH_HALF	gotoblas_iflush_half()
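Net effect of these hunks: the SSE2 mfence/sfence definitions are commented out and MB/WMB become empty on this target — presumably relying on x86-64's strong memory ordering so the hot GEMV loops no longer pay for explicit fences.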
@@ -251,7 +251,11 @@ void blas_set_parameter(void){

  env_var_t p;
  int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
  int size = 16;
#else
  int size = get_L2_size();
#endif

#if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
  size >>= 7;
@@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order,
  int nthreads_avail = nthreads_max;

  double MNK = (double) m * (double) n;
  if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
  if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
    nthreads_max = 1;

  if ( nthreads_max > nthreads_avail )
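To put the new single-thread cut-off in perspective: with a hypothetical GEMM_MULTITHREAD_THRESHOLD of 4, the old bound was 500.0 * 100.0 * 4 = 200,000 while the new one is 24.0 * 24.0 * 16 = 9,216. GEMV therefore falls back to one thread only up to roughly a 96x96 problem instead of ~447x447, so the faster kernels start using threads much earlier.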
@@ -10,8 +10,8 @@ DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL =  ssymv_U.c
SSYMV_L_KERNEL =  ssymv_L.c

SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t.c

@@ -1,8 +1,8 @@
SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c

DGEMVNKERNEL = dgemv_n.c
DGEMVTKERNEL = dgemv_t.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c

ZGEMVNKERNEL = zgemv_n.c
ZGEMVTKERNEL = zgemv_t.c

@@ -9,9 +9,9 @@ DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL =  ssymv_U.c
SSYMV_L_KERNEL =  ssymv_L.c

SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
DGEMVNKERNEL = dgemv_n.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c

SGEMMKERNEL    =  gemm_kernel_4x8_nehalem.S
SGEMMINCOPY    =  gemm_ncopy_4.S

@@ -1,5 +1,5 @@
SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t.S

@@ -1,5 +1,5 @@
SGEMVNKERNEL = sgemv_n.c
SGEMVTKERNEL = sgemv_t.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c

ZGEMVNKERNEL = zgemv_n.c
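These hunks edit the per-target KERNEL make fragments: the build compiles whatever source each SGEMVNKERNEL/SGEMVTKERNEL (etc.) variable names, so repointing them from sgemv_n.c/dgemv_n.c to the new *_4.c files is what switches the affected CPU targets over to the 4-column blocked kernels added below.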
@@ -0,0 +1,548 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


#include "common.h"


#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL)
#include "dgemv_n_microk_haswell-4.c"
#endif


#define NBMAX 2048

#ifndef HAVE_KERNEL_4x8

static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG i;
	FLOAT *a0,*a1,*a2,*a3;
	FLOAT *b0,*b1,*b2,*b3;
	FLOAT *x4;
	FLOAT x[8];
	a0 = ap[0];
	a1 = ap[1];
	a2 = ap[2];
	a3 = ap[3];
	b0 = a0 + lda4 ;
	b1 = a1 + lda4 ;
	b2 = a2 + lda4 ;
	b3 = a3 + lda4 ;
	x4 = x + 4;

	for ( i=0; i<8; i++)
		x[i] = xo[i] * *alpha;

	for ( i=0; i< n; i+=4 )
	{

		y[i]   += a0[i]*x[0]   + a1[i]*x[1]   + a2[i]*x[2]   + a3[i]*x[3];
		y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
		y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
		y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];

		y[i]   += b0[i]*x4[0]   + b1[i]*x4[1]   + b2[i]*x4[2]   + b3[i]*x4[3];
		y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
		y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
		y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];

	}
}

#endif


#ifndef HAVE_KERNEL_4x4

static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	BLASLONG i;
	FLOAT *a0,*a1,*a2,*a3;
	FLOAT x[4];
	a0 = ap[0];
	a1 = ap[1];
	a2 = ap[2];
	a3 = ap[3];

	for ( i=0; i<4; i++)
		x[i] = xo[i] * *alpha;

	for ( i=0; i< n; i+=4 )
	{
		y[i]   += a0[i]*x[0]   + a1[i]*x[1]   + a2[i]*x[2]   + a3[i]*x[3];
		y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
		y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
		y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
	}
}

#endif

#ifndef HAVE_KERNEL_4x2

static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd	 (%2) , %%xmm12		\n\t"	// x0
	"movsd	 (%6) , %%xmm4		\n\t"	// alpha
	"movsd	8(%2) , %%xmm13		\n\t"	// x1
	"mulsd	%%xmm4 , %%xmm12	\n\t"	// alpha
	"mulsd	%%xmm4 , %%xmm13	\n\t"	// alpha
	"shufpd	$0, %%xmm12, %%xmm12	\n\t"
	"shufpd	$0, %%xmm13, %%xmm13	\n\t"

	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"movups	  (%3,%0,8), %%xmm4	\n\t"	// 2 * y
	"movups	16(%3,%0,8), %%xmm5	\n\t"	// 2 * y

	"movups	  (%4,%0,8), %%xmm8	\n\t"
	"movups	  (%5,%0,8), %%xmm9	\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm4		\n\t"

	"movups	16(%4,%0,8), %%xmm8	\n\t"
	"movups	16(%5,%0,8), %%xmm9	\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"addpd	%%xmm8 , %%xmm5		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"

	"movups	%%xmm4 ,   (%3,%0,8)	\n\t"	// 2 * y
	"movups	%%xmm5 , 16(%3,%0,8)	\n\t"	// 2 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (alpha)	// 6
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#endif

#ifndef HAVE_KERNEL_4x1

static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd	(%2), %%xmm12		\n\t"	// x0
	"mulsd	(%5), %%xmm12		\n\t"	// alpha
	"shufpd	$0, %%xmm12, %%xmm12	\n\t"

	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"movups	  (%4,%0,8), %%xmm8	\n\t"	// 2 * a
	"movups	16(%4,%0,8), %%xmm9	\n\t"	// 2 * a
	"movups	  (%3,%0,8), %%xmm4	\n\t"	// 2 * y
	"movups	16(%3,%0,8), %%xmm5	\n\t"	// 2 * y
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm12, %%xmm9		\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"

	"movups	%%xmm4 ,   (%3,%0,8)	\n\t"	// 2 * y
	"movups	%%xmm5 , 16(%3,%0,8)	\n\t"	// 2 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"

	"jnz	.L01LOOP%=		\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap),	// 4
	  "r" (alpha)	// 5
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#endif

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
	BLASLONG i;
	if ( inc_dest != 1 )
	{
		for ( i=0; i<n; i++ )
		{
			*dest += *src;
			src++;
			dest += inc_dest;
		}
		return;
	}

}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	FLOAT *ap[4];
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	BLASLONG lda4 = lda << 2;
	BLASLONG lda8 = lda << 3;
	FLOAT xbuffer[8],*ybuffer;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	ybuffer = buffer;

	if ( inc_x == 1 )
	{
		n1 = n >> 3 ;
		n2 = n &  7 ;
	}
	else
	{
		n1 = n >> 2 ;
		n2 = n &  3 ;

	}

	m3 = m & 3  ;
	m1 = m & -4 ;
	m2 = (m & (NBMAX-1)) - m3 ;


	y_ptr = y;

	BLASLONG NB = NBMAX;

	while ( NB == NBMAX )
	{

		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;
			NB = m2;
		}

		a_ptr = a;
		x_ptr = x;

		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		if ( inc_y != 1 )
			memset(ybuffer,0,NB*8);
		else
			ybuffer = y_ptr;

		if ( inc_x == 1 )
		{

			for( i = 0; i < n1 ; i++)
			{
				dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
				ap[0] += lda8;
				ap[1] += lda8;
				ap[2] += lda8;
				ap[3] += lda8;
				a_ptr += lda8;
				x_ptr += 8;
			}

			if ( n2 & 4 )
			{
				dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
				ap[0] += lda4;
				ap[1] += lda4;
				a_ptr += lda4;
				x_ptr += 4;
			}

			if ( n2 & 2 )
			{
				dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
				a_ptr += lda*2;
				x_ptr += 2;
			}

			if ( n2 & 1 )
			{
				dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
				a_ptr += lda;
				x_ptr += 1;

			}

		}
		else
		{

			for( i = 0; i < n1 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[1] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[2] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[3] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
			}

			for( i = 0; i < n2 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
				a_ptr += lda;

			}

		}

		a += NB;
		if ( inc_y != 1 )
		{
			add_y(NB,ybuffer,y_ptr,inc_y);
			y_ptr += NB * inc_y;
		}
		else
			y_ptr += NB ;

	}

	if ( m3 == 0 ) return(0);

	if ( m3 == 3 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		FLOAT temp2 = 0.0;
		if ( lda == 3 && inc_x ==1 )
		{

			for( i = 0; i < ( n & -4 ); i+=4 )
			{

				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
				temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

				temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
				temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
				temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

				a_ptr += 12;
				x_ptr += 4;
			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += 3;
				x_ptr ++;
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;

			}

		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp2;
		return(0);
	}


	if ( m3 == 2 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		if ( lda == 2 && inc_x ==1 )
		{

			for( i = 0; i < (n & -4) ; i+=4 )
			{
				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
				temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
				a_ptr += 8;
				x_ptr += 4;

			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += 2;
				x_ptr ++;
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;

			}

		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		return(0);
	}

	if ( m3 == 1 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp = 0.0;
		if ( lda == 1 && inc_x ==1 )
		{

			for( i = 0; i < (n & -4); i+=4 )
			{
				temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];

			}

			for( ; i < n; i++ )
			{
				temp += a_ptr[i] * x_ptr[i];
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp += a_ptr[0] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}

		}
		y_ptr[0] += alpha * temp;
		return(0);
	}


	return(0);
}
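The driver's structure — and the reason for NBMAX — can be summarized in plain C. A simplified sketch under column-major, unit-stride assumptions (illustrative only, not the file itself): y is processed in strips of at most NBMAX rows so the strip stays cache-resident while every column of the block is streamed over it.

	#include <stddef.h>

	#define NBMAX 2048  /* rows of y processed per block, as in the file */

	/* Simplified y += alpha * A * x for column-major A with leading
	   dimension lda and unit strides. */
	static void dgemv_n_blocked(size_t m, size_t n, double alpha,
	                            const double *a, size_t lda,
	                            const double *x, double *y)
	{
		for (size_t i0 = 0; i0 < m; i0 += NBMAX) {
			size_t nb = (m - i0 < NBMAX) ? (m - i0) : NBMAX;
			for (size_t j = 0; j < n; j++) {
				const double *col = a + i0 + j * lda;
				double xj = alpha * x[j];
				for (size_t i = 0; i < nb; i++)
					y[i0 + i] += col[i] * xj;   /* AXPY on the strip */
			}
		}
	}

The real kernel additionally walks four or eight columns per pass, which is what the 4x4/4x8 micro-kernels above implement.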
@@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/



#define HAVE_KERNEL_4x8 1
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vzeroupper			\n\t"
	"vbroadcastsd	  (%2), %%ymm12	\n\t"	// x0
	"vbroadcastsd	 8(%2), %%ymm13	\n\t"	// x1
	"vbroadcastsd	16(%2), %%ymm14	\n\t"	// x2
	"vbroadcastsd	24(%2), %%ymm15	\n\t"	// x3
	"vbroadcastsd	32(%2), %%ymm0	\n\t"	// x4
	"vbroadcastsd	40(%2), %%ymm1	\n\t"	// x5
	"vbroadcastsd	48(%2), %%ymm2	\n\t"	// x6
	"vbroadcastsd	56(%2), %%ymm3	\n\t"	// x7

	"vbroadcastsd	  (%9), %%ymm6	\n\t"	// alpha

	"testq	$0x04, %1		\n\t"
	"jz	.L8LABEL%=		\n\t"

	"vmovupd	(%3,%0,8), %%ymm7	\n\t"	// 4 * y
	"vxorpd	%%ymm4 , %%ymm4, %%ymm4	\n\t"
	"vxorpd	%%ymm5 , %%ymm5, %%ymm5	\n\t"

	"vfmadd231pd	(%4,%0,8), %%ymm12, %%ymm4	\n\t"
	"vfmadd231pd	(%5,%0,8), %%ymm13, %%ymm5	\n\t"
	"vfmadd231pd	(%6,%0,8), %%ymm14, %%ymm4	\n\t"
	"vfmadd231pd	(%7,%0,8), %%ymm15, %%ymm5	\n\t"

	"vfmadd231pd	(%4,%8,8), %%ymm0 , %%ymm4	\n\t"
	"vfmadd231pd	(%5,%8,8), %%ymm1 , %%ymm5	\n\t"
	"vfmadd231pd	(%6,%8,8), %%ymm2 , %%ymm4	\n\t"
	"vfmadd231pd	(%7,%8,8), %%ymm3 , %%ymm5	\n\t"

	"vaddpd	%%ymm4 , %%ymm5 , %%ymm5	\n\t"
	"vmulpd	%%ymm6 , %%ymm5 , %%ymm5	\n\t"
	"vaddpd	%%ymm7 , %%ymm5 , %%ymm5	\n\t"


	"vmovupd	%%ymm5, (%3,%0,8)	\n\t"	// 4 * y

	"addq	$4 , %8			\n\t"
	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"

	".L8LABEL%=:			\n\t"

	"cmpq	$0, %1			\n\t"
	"je	.L16END%=		\n\t"


	".align 16			\n\t"
	".L01LOOP%=:			\n\t"

	"vxorpd	%%ymm4 , %%ymm4, %%ymm4	\n\t"
	"vxorpd	%%ymm5 , %%ymm5, %%ymm5	\n\t"
	"vmovupd	  (%3,%0,8), %%ymm8	\n\t"	// 4 * y
	"vmovupd	32(%3,%0,8), %%ymm9	\n\t"	// 4 * y

	"vfmadd231pd	  (%4,%0,8), %%ymm12, %%ymm4	\n\t"
	"vfmadd231pd	32(%4,%0,8), %%ymm12, %%ymm5	\n\t"
	"vfmadd231pd	  (%5,%0,8), %%ymm13, %%ymm4	\n\t"
	"vfmadd231pd	32(%5,%0,8), %%ymm13, %%ymm5	\n\t"
	"vfmadd231pd	  (%6,%0,8), %%ymm14, %%ymm4	\n\t"
	"vfmadd231pd	32(%6,%0,8), %%ymm14, %%ymm5	\n\t"
	"vfmadd231pd	  (%7,%0,8), %%ymm15, %%ymm4	\n\t"
	"vfmadd231pd	32(%7,%0,8), %%ymm15, %%ymm5	\n\t"

	"vfmadd231pd	  (%4,%8,8), %%ymm0 , %%ymm4	\n\t"
	"addq	$8 , %0			\n\t"
	"vfmadd231pd	32(%4,%8,8), %%ymm0 , %%ymm5	\n\t"
	"vfmadd231pd	  (%5,%8,8), %%ymm1 , %%ymm4	\n\t"
	"vfmadd231pd	32(%5,%8,8), %%ymm1 , %%ymm5	\n\t"
	"vfmadd231pd	  (%6,%8,8), %%ymm2 , %%ymm4	\n\t"
	"vfmadd231pd	32(%6,%8,8), %%ymm2 , %%ymm5	\n\t"
	"vfmadd231pd	  (%7,%8,8), %%ymm3 , %%ymm4	\n\t"
	"vfmadd231pd	32(%7,%8,8), %%ymm3 , %%ymm5	\n\t"

	"vfmadd231pd	%%ymm6 , %%ymm4 , %%ymm8	\n\t"
	"vfmadd231pd	%%ymm6 , %%ymm5 , %%ymm9	\n\t"

	"addq	$8 , %8			\n\t"
	"vmovupd	%%ymm8,-64(%3,%0,8)	\n\t"	// 4 * y
	"subq	$8 , %1			\n\t"
	"vmovupd	%%ymm9,-32(%3,%0,8)	\n\t"	// 4 * y

	"jnz	.L01LOOP%=		\n\t"

	".L16END%=:			\n\t"
	"vzeroupper			\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3]),	// 7
	  "r" (lda4),	// 8
	  "r" (alpha)	// 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}



#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vzeroupper			\n\t"
	"vbroadcastsd	  (%2), %%ymm12	\n\t"	// x0
	"vbroadcastsd	 8(%2), %%ymm13	\n\t"	// x1
	"vbroadcastsd	16(%2), %%ymm14	\n\t"	// x2
	"vbroadcastsd	24(%2), %%ymm15	\n\t"	// x3

	"vbroadcastsd	  (%8), %%ymm6	\n\t"	// alpha

	"testq	$0x04, %1		\n\t"
	"jz	.L8LABEL%=		\n\t"

	"vxorpd	%%ymm4 , %%ymm4, %%ymm4	\n\t"
	"vxorpd	%%ymm5 , %%ymm5, %%ymm5	\n\t"
	"vmovupd	(%3,%0,8), %%ymm7	\n\t"	// 4 * y

	"vfmadd231pd	(%4,%0,8), %%ymm12, %%ymm4	\n\t"
	"vfmadd231pd	(%5,%0,8), %%ymm13, %%ymm5	\n\t"
	"vfmadd231pd	(%6,%0,8), %%ymm14, %%ymm4	\n\t"
	"vfmadd231pd	(%7,%0,8), %%ymm15, %%ymm5	\n\t"

	"vaddpd	%%ymm4 , %%ymm5 , %%ymm5	\n\t"
	"vmulpd	%%ymm6 , %%ymm5 , %%ymm5	\n\t"
	"vaddpd	%%ymm7 , %%ymm5 , %%ymm5	\n\t"

	"vmovupd	%%ymm5, (%3,%0,8)	\n\t"	// 4 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"

	".L8LABEL%=:			\n\t"

	"cmpq	$0, %1			\n\t"
	"je	.L8END%=		\n\t"


	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"vxorpd	%%ymm4 , %%ymm4, %%ymm4	\n\t"
	"vxorpd	%%ymm5 , %%ymm5, %%ymm5	\n\t"
	"vmovupd	  (%3,%0,8), %%ymm8	\n\t"	// 4 * y
	"vmovupd	32(%3,%0,8), %%ymm9	\n\t"	// 4 * y

	"vfmadd231pd	  (%4,%0,8), %%ymm12, %%ymm4	\n\t"
	"vfmadd231pd	32(%4,%0,8), %%ymm12, %%ymm5	\n\t"
	"vfmadd231pd	  (%5,%0,8), %%ymm13, %%ymm4	\n\t"
	"vfmadd231pd	32(%5,%0,8), %%ymm13, %%ymm5	\n\t"
	"vfmadd231pd	  (%6,%0,8), %%ymm14, %%ymm4	\n\t"
	"vfmadd231pd	32(%6,%0,8), %%ymm14, %%ymm5	\n\t"
	"vfmadd231pd	  (%7,%0,8), %%ymm15, %%ymm4	\n\t"
	"vfmadd231pd	32(%7,%0,8), %%ymm15, %%ymm5	\n\t"

	"vfmadd231pd	%%ymm6 , %%ymm4 , %%ymm8	\n\t"
	"vfmadd231pd	%%ymm6 , %%ymm5 , %%ymm9	\n\t"

	"vmovupd	%%ymm8,   (%3,%0,8)	\n\t"	// 4 * y
	"vmovupd	%%ymm9, 32(%3,%0,8)	\n\t"	// 4 * y

	"addq	$8 , %0			\n\t"
	"subq	$8 , %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	".L8END%=:			\n\t"
	"vzeroupper			\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3]),	// 7
	  "r" (alpha)	// 8
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
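For readers who don't speak AT&T assembly, the 4x4 Haswell kernel above is, in spirit, the following AVX2/FMA intrinsics loop. This is a hedged re-expression for illustration, not part of the patch; it ignores the asm's 8-element unrolling and tail handling, and assumes n is a multiple of 4.

	#include <immintrin.h>

	/* y[0..n) += alpha * (ap[0]*x[0] + ap[1]*x[1] + ap[2]*x[2] + ap[3]*x[3]),
	   four doubles of y per iteration. */
	static void dgemv_kernel_4x4_intrin(long n, double **ap, const double *x,
	                                    double *y, const double *alpha)
	{
		__m256d x0 = _mm256_set1_pd(x[0]);   /* like vbroadcastsd */
		__m256d x1 = _mm256_set1_pd(x[1]);
		__m256d x2 = _mm256_set1_pd(x[2]);
		__m256d x3 = _mm256_set1_pd(x[3]);
		__m256d va = _mm256_set1_pd(*alpha);

		for (long i = 0; i < n; i += 4) {
			__m256d acc = _mm256_mul_pd(_mm256_loadu_pd(&ap[0][i]), x0);
			acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[1][i]), x1, acc);
			acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[2][i]), x2, acc);
			acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[3][i]), x3, acc);
			/* y += alpha * acc, matching the final vfmadd231pd on ymm6 */
			__m256d vy = _mm256_loadu_pd(&y[i]);
			vy = _mm256_fmadd_pd(va, acc, vy);
			_mm256_storeu_pd(&y[i], vy);
		}
	}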
@@ -0,0 +1,265 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/



#define HAVE_KERNEL_4x8 1
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd	  (%2), %%xmm12		\n\t"	// x0
	"movsd	 8(%2), %%xmm13		\n\t"	// x1
	"movsd	16(%2), %%xmm14		\n\t"	// x2
	"movsd	24(%2), %%xmm15		\n\t"	// x3
	"shufpd	$0, %%xmm12, %%xmm12	\n\t"
	"shufpd	$0, %%xmm13, %%xmm13	\n\t"
	"shufpd	$0, %%xmm14, %%xmm14	\n\t"
	"shufpd	$0, %%xmm15, %%xmm15	\n\t"

	"movsd	32(%2), %%xmm0		\n\t"	// x4
	"movsd	40(%2), %%xmm1		\n\t"	// x5
	"movsd	48(%2), %%xmm2		\n\t"	// x6
	"movsd	56(%2), %%xmm3		\n\t"	// x7
	"shufpd	$0, %%xmm0 , %%xmm0	\n\t"
	"shufpd	$0, %%xmm1 , %%xmm1	\n\t"
	"shufpd	$0, %%xmm2 , %%xmm2	\n\t"
	"shufpd	$0, %%xmm3 , %%xmm3	\n\t"

	"movsd	  (%9), %%xmm6		\n\t"	// alpha
	"shufpd	$0, %%xmm6 , %%xmm6	\n\t"


	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"xorpd	%%xmm4 , %%xmm4		\n\t"
	"xorpd	%%xmm5 , %%xmm5		\n\t"
	"movups	(%3,%0,8), %%xmm7	\n\t"	// 2 * y

	".align 2			\n\t"
	"movups	(%4,%0,8), %%xmm8	\n\t"
	"movups	(%5,%0,8), %%xmm9	\n\t"
	"movups	(%6,%0,8), %%xmm10	\n\t"
	"movups	(%7,%0,8), %%xmm11	\n\t"
	".align 2			\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"mulpd	%%xmm14, %%xmm10	\n\t"
	"mulpd	%%xmm15, %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"
	"addpd	%%xmm10, %%xmm4		\n\t"
	"addpd	%%xmm11, %%xmm5		\n\t"

	"movups	(%4,%8,8), %%xmm8	\n\t"
	"movups	(%5,%8,8), %%xmm9	\n\t"
	"movups	(%6,%8,8), %%xmm10	\n\t"
	"movups	(%7,%8,8), %%xmm11	\n\t"
	".align 2			\n\t"
	"mulpd	%%xmm0 , %%xmm8		\n\t"
	"mulpd	%%xmm1 , %%xmm9		\n\t"
	"mulpd	%%xmm2 , %%xmm10	\n\t"
	"mulpd	%%xmm3 , %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"
	"addpd	%%xmm10, %%xmm4		\n\t"
	"addpd	%%xmm11, %%xmm5		\n\t"

	"addpd	%%xmm5 , %%xmm4		\n\t"
	"mulpd	%%xmm6 , %%xmm4		\n\t"
	"addpd	%%xmm4 , %%xmm7		\n\t"

	"movups	%%xmm7 , (%3,%0,8)	\n\t"	// 2 * y

	"xorpd	%%xmm4 , %%xmm4		\n\t"
	"xorpd	%%xmm5 , %%xmm5		\n\t"
	"movups	16(%3,%0,8), %%xmm7	\n\t"	// 2 * y

	".align 2			\n\t"
	"movups	16(%4,%0,8), %%xmm8	\n\t"
	"movups	16(%5,%0,8), %%xmm9	\n\t"
	"movups	16(%6,%0,8), %%xmm10	\n\t"
	"movups	16(%7,%0,8), %%xmm11	\n\t"
	".align 2			\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"mulpd	%%xmm14, %%xmm10	\n\t"
	"mulpd	%%xmm15, %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"
	"addpd	%%xmm10, %%xmm4		\n\t"
	"addpd	%%xmm11, %%xmm5		\n\t"

	"movups	16(%4,%8,8), %%xmm8	\n\t"
	"movups	16(%5,%8,8), %%xmm9	\n\t"
	"movups	16(%6,%8,8), %%xmm10	\n\t"
	"movups	16(%7,%8,8), %%xmm11	\n\t"
	".align 2			\n\t"
	"mulpd	%%xmm0 , %%xmm8		\n\t"
	"mulpd	%%xmm1 , %%xmm9		\n\t"
	"mulpd	%%xmm2 , %%xmm10	\n\t"
	"mulpd	%%xmm3 , %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm5		\n\t"
	"addpd	%%xmm10, %%xmm4		\n\t"
	"addpd	%%xmm11, %%xmm5		\n\t"

	"addq	$4 , %8			\n\t"
	"addpd	%%xmm5 , %%xmm4		\n\t"
	"mulpd	%%xmm6 , %%xmm4		\n\t"
	"addpd	%%xmm4 , %%xmm7		\n\t"

	"movups	%%xmm7 , 16(%3,%0,8)	\n\t"	// 2 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3]),	// 7
	  "r" (lda4),	// 8
	  "r" (alpha)	// 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}




#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"movsd	  (%2), %%xmm12		\n\t"	// x0
	"movsd	 8(%2), %%xmm13		\n\t"	// x1
	"movsd	16(%2), %%xmm14		\n\t"	// x2
	"movsd	24(%2), %%xmm15		\n\t"	// x3
	"shufpd	$0, %%xmm12, %%xmm12	\n\t"
	"shufpd	$0, %%xmm13, %%xmm13	\n\t"
	"shufpd	$0, %%xmm14, %%xmm14	\n\t"
	"shufpd	$0, %%xmm15, %%xmm15	\n\t"

	"movsd	  (%8), %%xmm6		\n\t"	// alpha
	"shufpd	$0, %%xmm6 , %%xmm6	\n\t"

	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	"xorpd	%%xmm4 , %%xmm4		\n\t"
	"xorpd	%%xmm5 , %%xmm5		\n\t"
	"movups	(%3,%0,8), %%xmm7	\n\t"	// 2 * y

	"movups	(%4,%0,8), %%xmm8	\n\t"
	"movups	(%5,%0,8), %%xmm9	\n\t"
	"movups	(%6,%0,8), %%xmm10	\n\t"
	"movups	(%7,%0,8), %%xmm11	\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"mulpd	%%xmm14, %%xmm10	\n\t"
	"mulpd	%%xmm15, %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm4		\n\t"
	"addpd	%%xmm10 , %%xmm4	\n\t"
	"addpd	%%xmm4 , %%xmm11	\n\t"

	"mulpd	%%xmm6 , %%xmm11	\n\t"
	"addpd	%%xmm7 , %%xmm11	\n\t"
	"movups	%%xmm11, (%3,%0,8)	\n\t"	// 2 * y

	"xorpd	%%xmm4 , %%xmm4		\n\t"
	"xorpd	%%xmm5 , %%xmm5		\n\t"
	"movups	16(%3,%0,8), %%xmm7	\n\t"	// 2 * y

	"movups	16(%4,%0,8), %%xmm8	\n\t"
	"movups	16(%5,%0,8), %%xmm9	\n\t"
	"movups	16(%6,%0,8), %%xmm10	\n\t"
	"movups	16(%7,%0,8), %%xmm11	\n\t"
	"mulpd	%%xmm12, %%xmm8		\n\t"
	"mulpd	%%xmm13, %%xmm9		\n\t"
	"mulpd	%%xmm14, %%xmm10	\n\t"
	"mulpd	%%xmm15, %%xmm11	\n\t"
	"addpd	%%xmm8 , %%xmm4		\n\t"
	"addpd	%%xmm9 , %%xmm4		\n\t"
	"addpd	%%xmm10 , %%xmm4	\n\t"
	"addpd	%%xmm4 , %%xmm11	\n\t"

	"mulpd	%%xmm6 , %%xmm11	\n\t"
	"addpd	%%xmm7 , %%xmm11	\n\t"
	"movups	%%xmm11, 16(%3,%0,8)	\n\t"	// 2 * y

	"addq	$4 , %0			\n\t"
	"subq	$4 , %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	:
	:
	  "r" (i),	// 0
	  "r" (n),	// 1
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3]),	// 7
	  "r" (alpha)	// 8
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
@ -0,0 +1,615 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(HASWELL)
|
||||
#include "dgemv_t_microk_haswell-4.c"
|
||||
#endif
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a0,*a1,*a2,*a3;
|
||||
a0 = ap[0];
|
||||
a1 = ap[1];
|
||||
a2 = ap[2];
|
||||
a3 = ap[3];
|
||||
FLOAT temp0 = 0.0;
|
||||
FLOAT temp1 = 0.0;
|
||||
FLOAT temp2 = 0.0;
|
||||
FLOAT temp3 = 0.0;
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
|
||||
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
|
||||
temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
|
||||
temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
|
||||
}
|
||||
y[0] = temp0;
|
||||
y[1] = temp1;
|
||||
y[2] = temp2;
|
||||
y[3] = temp3;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
i=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"xorpd %%xmm10 , %%xmm10 \n\t"
|
||||
"xorpd %%xmm11 , %%xmm11 \n\t"
|
||||
|
||||
"testq $2 , %1 \n\t"
|
||||
"jz .L01LABEL%= \n\t"
|
||||
|
||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1
|
||||
"mulpd %%xmm14 , %%xmm12 \n\t"
|
||||
"mulpd %%xmm14 , %%xmm13 \n\t"
|
||||
"addq $2 , %0 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"subq $2 , %1 \n\t"
|
||||
"addpd %%xmm13 , %%xmm11 \n\t"
|
||||
|
||||
".L01LABEL%=: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1
|
||||
"mulpd %%xmm14 , %%xmm12 \n\t"
|
||||
"mulpd %%xmm14 , %%xmm13 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"addpd %%xmm13 , %%xmm11 \n\t"
|
||||
|
||||
"movups 16(%5,%0,8) , %%xmm14 \n\t" // x
|
||||
"movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||
"movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1
|
||||
"mulpd %%xmm14 , %%xmm12 \n\t"
|
||||
"mulpd %%xmm14 , %%xmm13 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"addpd %%xmm13 , %%xmm11 \n\t"
|
||||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
|
||||
"haddpd %%xmm10, %%xmm10 \n\t"
|
||||
"haddpd %%xmm11, %%xmm11 \n\t"
|
||||
|
||||
"movsd %%xmm10, (%2) \n\t"
|
||||
"movsd %%xmm11,8(%2) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (y), // 2
|
||||
"r" (ap0), // 3
|
||||
"r" (ap1), // 4
|
||||
"r" (x) // 5
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
i=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"xorpd %%xmm9 , %%xmm9 \n\t"
|
||||
"xorpd %%xmm10 , %%xmm10 \n\t"
|
||||
|
||||
"testq $2 , %1 \n\t"
|
||||
"jz .L01LABEL%= \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||
"mulpd %%xmm11 , %%xmm12 \n\t"
|
||||
"addq $2 , %0 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"subq $2 , %1 \n\t"
|
||||
|
||||
".L01LABEL%=: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups 16(%3,%0,8) , %%xmm14 \n\t"
|
||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||
"movups 16(%4,%0,8) , %%xmm13 \n\t"
|
||||
"mulpd %%xmm11 , %%xmm12 \n\t"
|
||||
"mulpd %%xmm13 , %%xmm14 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"addpd %%xmm14 , %%xmm9 \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
|
||||
"addpd %%xmm9 , %%xmm10 \n\t"
|
||||
"haddpd %%xmm10, %%xmm10 \n\t"
|
||||
|
||||
"movsd %%xmm10, (%2) \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (y), // 2
|
||||
"r" (ap), // 3
|
||||
"r" (x) // 4
|
||||
: "cc",
|
||||
"%xmm9", "%xmm10" ,
|
||||
"%xmm11", "%xmm12", "%xmm13", "%xmm14",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
|
||||
{
|
||||
BLASLONG i;
|
||||
for ( i=0; i<n; i++ )
|
||||
{
|
||||
*dest = *src;
|
||||
dest++;
|
||||
src += inc_src;
|
||||
}
|
||||
}
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||
{
|
||||
|
||||
BLASLONG i;
|
||||
|
||||
if ( inc_dest != 1 )
|
||||
{
|
||||
for ( i=0; i<n; i++ )
|
||||
{
|
||||
*dest += src[i] * da;
|
||||
dest += inc_dest;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
i=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"movsd (%2) , %%xmm10 \n\t"
|
||||
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||
"mulpd %%xmm10 , %%xmm12 \n\t"
|
||||
"addq $2 , %0 \n\t"
|
||||
"addpd %%xmm12 , %%xmm11 \n\t"
|
||||
"subq $2 , %1 \n\t"
|
||||
"movups %%xmm11, -16(%4,%0,8) \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (&da), // 2
|
||||
"r" (src), // 3
|
||||
"r" (dest) // 4
|
||||
: "cc",
|
||||
"%xmm10", "%xmm11", "%xmm12",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG register i;
|
||||
BLASLONG register j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
BLASLONG n0;
|
||||
BLASLONG n1;
|
||||
BLASLONG m1;
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
FLOAT ybuffer[4],*xbuffer;
|
||||
FLOAT *ytemp;
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
if ( n < 1 ) return(0);
|
||||
|
||||
xbuffer = buffer;
|
||||
ytemp = buffer + NBMAX;
|
||||
|
||||
n0 = n / NBMAX;
|
||||
n1 = (n % NBMAX) >> 2 ;
|
||||
n2 = n & 3 ;
|
||||
|
||||
m3 = m & 3 ;
|
||||
m1 = m & -4 ;
|
||||
m2 = (m & (NBMAX-1)) - m3 ;
|
||||
|
||||
|
||||
BLASLONG NB = NBMAX;
|
||||
|
||||
while ( NB == NBMAX )
|
||||
{
|
||||
|
||||
m1 -= NB;
|
||||
if ( m1 < 0)
|
||||
{
|
||||
if ( m2 == 0 ) break;
|
||||
NB = m2;
|
||||
}
|
||||
|
||||
y_ptr = y;
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
|
||||
if ( inc_x == 1 )
|
||||
xbuffer = x_ptr;
|
||||
else
|
||||
copy_x(NB,x_ptr,xbuffer,inc_x);
|
||||
|
||||
|
||||
FLOAT *ap[4];
|
||||
FLOAT *yp;
|
||||
BLASLONG register lda4 = 4 * lda;
|
||||
ap[0] = a_ptr;
|
||||
ap[1] = a_ptr + lda;
|
||||
ap[2] = ap[1] + lda;
|
||||
ap[3] = ap[2] + lda;
|
||||
|
||||
if ( n0 > 0 )
|
||||
{
|
||||
BLASLONG nb1 = NBMAX / 4;
|
||||
for( j=0; j<n0; j++)
|
||||
{
|
||||
|
||||
yp = ytemp;
|
||||
for( i = 0; i < nb1 ; i++)
|
||||
{
|
||||
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
|
||||
ap[0] += lda4 ;
|
||||
ap[1] += lda4 ;
|
||||
ap[2] += lda4 ;
|
||||
ap[3] += lda4 ;
|
||||
yp += 4;
|
||||
}
|
||||
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
|
||||
y_ptr += nb1 * inc_y * 4;
|
||||
a_ptr += nb1 * lda4 ;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
yp = ytemp;
|
||||
|
||||
for( i = 0; i < n1 ; i++)
|
||||
{
|
||||
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
|
||||
ap[0] += lda4 ;
|
||||
ap[1] += lda4 ;
|
||||
ap[2] += lda4 ;
|
||||
ap[3] += lda4 ;
|
||||
yp += 4;
|
||||
}
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
|
||||
y_ptr += n1 * inc_y * 4;
|
||||
a_ptr += n1 * lda4 ;
|
||||
}
|
||||
|
||||
if ( n2 & 2 )
|
||||
{
|
||||
|
||||
dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
|
||||
a_ptr += lda * 2;
|
||||
*y_ptr += ybuffer[0] * alpha;
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[1] * alpha;
|
||||
y_ptr += inc_y;
|
||||
|
||||
}
|
||||
|
||||
if ( n2 & 1 )
|
||||
{
|
||||
|
||||
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
|
||||
a_ptr += lda;
|
||||
*y_ptr += ybuffer[0] * alpha;
|
||||
y_ptr += inc_y;
|
||||
|
||||
}
|
||||
a += NB;
|
||||
x += NB * inc_x;
|
||||
}
|
||||
|
||||
if ( m3 == 0 ) return(0);
|
||||
|
||||
x_ptr = x;
|
||||
a_ptr = a;
|
||||
if ( m3 == 3 )
|
||||
{
|
||||
FLOAT xtemp0 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp1 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp2 = *x_ptr * alpha;
|
||||
|
||||
FLOAT *aj = a_ptr;
|
||||
y_ptr = y;
|
||||
|
||||
        if ( lda == 3 && inc_y == 1 )
        {

            for ( j=0; j< ( n & -4) ; j+=4 )
            {

                y_ptr[j]   += aj[0] * xtemp0 + aj[1]  * xtemp1 + aj[2]  * xtemp2;
                y_ptr[j+1] += aj[3] * xtemp0 + aj[4]  * xtemp1 + aj[5]  * xtemp2;
                y_ptr[j+2] += aj[6] * xtemp0 + aj[7]  * xtemp1 + aj[8]  * xtemp2;
                y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
                aj += 12;
            }

            for ( ; j<n; j++ )
            {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
                aj += 3;
            }

        }
        else
        {

            if ( inc_y == 1 )
            {

                BLASLONG register lda2 = lda << 1;
                BLASLONG register lda4 = lda << 2;
                BLASLONG register lda3 = lda2 + lda;

                for ( j=0; j< ( n & -4 ); j+=4 )
                {

                    y_ptr[j]   += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
                    y_ptr[j+1] += *(aj+lda)  * xtemp0 + *(aj+lda+1)  * xtemp1 + *(aj+lda+2)  * xtemp2;
                    y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
                    y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
                    aj += lda4;
                }

                for ( ; j< n ; j++ )
                {

                    y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
                    aj += lda;
                }

            }
            else
            {

                for ( j=0; j<n; j++ )
                {
                    *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
                    y_ptr += inc_y;
                    aj += lda;
                }

            }

        }
        return(0);
    }

    if ( m3 == 2 )
    {
        FLOAT xtemp0 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp1 = *x_ptr * alpha;

        FLOAT *aj = a_ptr;
        y_ptr = y;

        if ( lda == 2 && inc_y == 1 )
        {

            for ( j=0; j< ( n & -4) ; j+=4 )
            {
                y_ptr[j]   += aj[0] * xtemp0 + aj[1] * xtemp1 ;
                y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
                y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
                y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
                aj += 8;

            }

            for ( ; j<n; j++ )
            {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
                aj += 2;
            }

        }
        else
        {
            if ( inc_y == 1 )
            {

                BLASLONG register lda2 = lda << 1;
                BLASLONG register lda4 = lda << 2;
                BLASLONG register lda3 = lda2 + lda;

                for ( j=0; j< ( n & -4 ); j+=4 )
                {

                    y_ptr[j]   += *aj * xtemp0 + *(aj+1) * xtemp1 ;
                    y_ptr[j+1] += *(aj+lda)  * xtemp0 + *(aj+lda+1)  * xtemp1 ;
                    y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
                    y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
                    aj += lda4;
                }

                for ( ; j< n ; j++ )
                {

                    y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
                    aj += lda;
                }

            }
            else
            {
                for ( j=0; j<n; j++ )
                {
                    *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
                    y_ptr += inc_y;
                    aj += lda;
                }
            }

        }
        return(0);

    }

    FLOAT xtemp = *x_ptr * alpha;
    FLOAT *aj = a_ptr;
    y_ptr = y;
    if ( lda == 1 && inc_y == 1 )
    {
        for ( j=0; j< ( n & -4) ; j+=4 )
        {
            y_ptr[j]   += aj[j]   * xtemp;
            y_ptr[j+1] += aj[j+1] * xtemp;
            y_ptr[j+2] += aj[j+2] * xtemp;
            y_ptr[j+3] += aj[j+3] * xtemp;
        }
        for ( ; j<n ; j++ )
        {
            y_ptr[j] += aj[j] * xtemp;
        }

    }
    else
    {
        if ( inc_y == 1 )
        {

            BLASLONG register lda2 = lda << 1;
            BLASLONG register lda4 = lda << 2;
            BLASLONG register lda3 = lda2 + lda;
            for ( j=0; j< ( n & -4 ); j+=4 )
            {
                y_ptr[j]   += *aj        * xtemp;
                y_ptr[j+1] += *(aj+lda)  * xtemp;
                y_ptr[j+2] += *(aj+lda2) * xtemp;
                y_ptr[j+3] += *(aj+lda3) * xtemp;
                aj += lda4 ;
            }

            for ( ; j<n; j++ )
            {
                y_ptr[j] += *aj * xtemp;
                aj += lda;
            }

        }
        else
        {
            for ( j=0; j<n; j++ )
            {
                *y_ptr += *aj * xtemp;
                y_ptr += inc_y;
                aj += lda;
            }

        }
    }

    return(0);
}
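
/* Editor's note: the remainder paths above all share one shape: a main loop
   over n & -4 iterations, unrolled four ways, followed by a scalar tail for
   the last n % 4 iterations. A minimal self-contained sketch of that pattern
   (the function and data below are illustrative, not part of this file): */

#include <stdio.h>

static void scale_add(int n, const double *a, double s, double *y)
{
    int j;
    for ( j = 0; j < (n & -4); j += 4 )   /* unrolled main loop */
    {
        y[j]   += a[j]   * s;
        y[j+1] += a[j+1] * s;
        y[j+2] += a[j+2] * s;
        y[j+3] += a[j+3] * s;
    }
    for ( ; j < n; j++ )                  /* scalar tail, 0..3 iterations */
        y[j] += a[j] * s;
}

int main(void)
{
    double a[6] = { 1, 2, 3, 4, 5, 6 }, y[6] = { 0 };
    scale_add(6, a, 2.0, y);
    printf("%g %g\n", y[0], y[5]);        /* prints: 2 12 */
    return 0;
}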
@ -0,0 +1,127 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

    BLASLONG register i = 0;

    __asm__ __volatile__
    (
        "vzeroupper                              \n\t"
        "vxorpd  %%ymm4 , %%ymm4, %%ymm4         \n\t"
        "vxorpd  %%ymm5 , %%ymm5, %%ymm5         \n\t"
        "vxorpd  %%ymm6 , %%ymm6, %%ymm6         \n\t"
        "vxorpd  %%ymm7 , %%ymm7, %%ymm7         \n\t"

        "testq   $0x04, %1                       \n\t"
        "jz      .L08LABEL%=                     \n\t"

        "vmovups (%2,%0,8), %%ymm12              \n\t"  // 4 * x

        "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4  \n\t"
        "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5  \n\t"
        "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6  \n\t"
        "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7  \n\t"

        "addq    $4 , %0                         \n\t"
        "subq    $4 , %1                         \n\t"

        ".L08LABEL%=:                            \n\t"

        "cmpq    $0, %1                          \n\t"
        "je      .L16END%=                       \n\t"

        ".align 16                               \n\t"
        ".L01LOOP%=:                             \n\t"
        // "prefetcht0 384(%2,%0,8)              \n\t"
        "vmovups (%2,%0,8), %%ymm12              \n\t"  // 4 * x
        "vmovups 32(%2,%0,8), %%ymm13            \n\t"  // 4 * x

        // "prefetcht0 384(%4,%0,8)              \n\t"
        "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4  \n\t"
        "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5  \n\t"
        // "prefetcht0 384(%5,%0,8)              \n\t"
        "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6  \n\t"
        "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7  \n\t"
        // "prefetcht0 384(%6,%0,8)              \n\t"
        "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
        "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
        "addq    $8 , %0                         \n\t"
        // "prefetcht0 384(%7,%0,8)              \n\t"
        "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
        "subq    $8 , %1                         \n\t"
        "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"

        "jnz     .L01LOOP%=                      \n\t"

        ".L16END%=:                              \n\t"

        "vextractf128 $1 , %%ymm4, %%xmm12       \n\t"
        "vextractf128 $1 , %%ymm5, %%xmm13       \n\t"
        "vextractf128 $1 , %%ymm6, %%xmm14       \n\t"
        "vextractf128 $1 , %%ymm7, %%xmm15       \n\t"

        "vaddpd  %%xmm4, %%xmm12, %%xmm4         \n\t"
        "vaddpd  %%xmm5, %%xmm13, %%xmm5         \n\t"
        "vaddpd  %%xmm6, %%xmm14, %%xmm6         \n\t"
        "vaddpd  %%xmm7, %%xmm15, %%xmm7         \n\t"

        "vhaddpd %%xmm4, %%xmm4, %%xmm4          \n\t"
        "vhaddpd %%xmm5, %%xmm5, %%xmm5          \n\t"
        "vhaddpd %%xmm6, %%xmm6, %%xmm6          \n\t"
        "vhaddpd %%xmm7, %%xmm7, %%xmm7          \n\t"

        "vmovsd  %%xmm4,   (%3)                  \n\t"
        "vmovsd  %%xmm5,  8(%3)                  \n\t"
        "vmovsd  %%xmm6, 16(%3)                  \n\t"
        "vmovsd  %%xmm7, 24(%3)                  \n\t"

        "vzeroupper                              \n\t"

        :
        :
          "r" (i),      // 0
          "r" (n),      // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3])   // 7
        : "cc",
          "%xmm4", "%xmm5", "%xmm6", "%xmm7",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}
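
/* Editor's note: a plain-C reference for what the kernel above computes,
   assuming FLOAT is double and n is a positive multiple of 4 (the asm
   handles an n & 4 head, then loops eight elements at a time). Each of the
   four column pointers is reduced to a dot product against x, and y[0..3]
   is overwritten, not accumulated. The _ref name is illustrative only: */

static void dgemv_kernel_4x4_ref(long n, double **ap, double *x, double *y)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    for ( long i = 0; i < n; i++ )
    {
        s0 += ap[0][i] * x[i];
        s1 += ap[1][i] * x[i];
        s2 += ap[2][i] * x[i];
        s3 += ap[3][i] * x[i];
    }
    y[0] = s0;  y[1] = s1;  y[2] = s2;  y[3] = s3;
}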
@ -0,0 +1,591 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_n_microk_sandy-4.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell-4.c"
#endif
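
/* Editor's note: the include chain above is the dispatch mechanism. Each
   microkernel file defines HAVE_KERNEL_* for the kernels it implements in
   assembly; the plain-C versions further down in this file are compiled
   only when the corresponding macro is absent. A generic, self-contained
   illustration of the pattern (names here are hypothetical): */

#include <stdio.h>

/* an arch-specific header included earlier may provide an optimized
   kernel_4x4() and define HAVE_KERNEL_4x4; otherwise this portable
   fallback is used */
#ifndef HAVE_KERNEL_4x4
static void kernel_4x4(void) { puts("portable C fallback"); }
#endif

int main(void) { kernel_4x4(); return 0; }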

#define NBMAX 4096

#ifndef HAVE_KERNEL_4x8

static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
    BLASLONG i;
    FLOAT *a0,*a1,*a2,*a3;
    FLOAT *b0,*b1,*b2,*b3;
    FLOAT *x4;
    FLOAT x[8];
    a0 = ap[0];
    a1 = ap[1];
    a2 = ap[2];
    a3 = ap[3];
    b0 = a0 + lda4 ;
    b1 = a1 + lda4 ;
    b2 = a2 + lda4 ;
    b3 = a3 + lda4 ;
    x4 = x + 4;

    for ( i=0; i<8; i++)
        x[i] = xo[i] * *alpha;

    for ( i=0; i< n; i+=4 )
    {

        y[i]   += a0[i]  *x[0] + a1[i]  *x[1] + a2[i]  *x[2] + a3[i]  *x[3];
        y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
        y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
        y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];

        y[i]   += b0[i]  *x4[0] + b1[i]  *x4[1] + b2[i]  *x4[2] + b3[i]  *x4[3];
        y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
        y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
        y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];

    }
}

#endif

#ifndef HAVE_KERNEL_4x4

static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    BLASLONG i;
    FLOAT *a0,*a1,*a2,*a3;
    FLOAT x[4];
    a0 = ap[0];
    a1 = ap[1];
    a2 = ap[2];
    a3 = ap[3];

    for ( i=0; i<4; i++)
        x[i] = xo[i] * *alpha;

    for ( i=0; i< n; i+=4 )
    {
        y[i]   += a0[i]  *x[0] + a1[i]  *x[1] + a2[i]  *x[2] + a3[i]  *x[3];
        y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
        y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
        y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
    }
}

#endif
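
/* Editor's note: a hedged standalone harness for the 4x4 fallback's
   contract, y += alpha * A(:,0:3) * x over n rows (n a multiple of 4).
   The typedefs stand in for common.h and the identity columns make the
   expected result easy to check; everything here is illustrative: */

#include <stdio.h>

typedef float FLOAT;
typedef long  BLASLONG;

static void kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT x[4];
    for ( int k = 0; k < 4; k++ ) x[k] = xo[k] * *alpha;
    for ( BLASLONG i = 0; i < n; i++ )
        y[i] += ap[0][i]*x[0] + ap[1][i]*x[1] + ap[2][i]*x[2] + ap[3][i]*x[3];
}

int main(void)
{
    FLOAT c0[4] = {1,0,0,0}, c1[4] = {0,1,0,0}, c2[4] = {0,0,1,0}, c3[4] = {0,0,0,1};
    FLOAT *ap[4] = { c0, c1, c2, c3 };
    FLOAT x[4] = { 1, 2, 3, 4 }, y[4] = { 0 }, alpha = 2.0f;
    kernel_4x4(4, ap, x, y, &alpha);                 /* y += alpha * I * x */
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); /* prints: 2 4 6 8 */
    return 0;
}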

#ifndef HAVE_KERNEL_4x2

static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

    BLASLONG register i = 0;

    __asm__ __volatile__
    (
        "movss   (%2) , %%xmm12          \n\t"  // x0
        "movss   (%6) , %%xmm4           \n\t"  // alpha
        "movss  4(%2) , %%xmm13          \n\t"  // x1
        "mulss  %%xmm4 , %%xmm12         \n\t"  // alpha
        "mulss  %%xmm4 , %%xmm13         \n\t"  // alpha
        "shufps $0, %%xmm12, %%xmm12     \n\t"
        "shufps $0, %%xmm13, %%xmm13     \n\t"

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "movups (%3,%0,4), %%xmm4        \n\t"  // 4 * y

        "movups (%4,%0,4), %%xmm8        \n\t"
        "movups (%5,%0,4), %%xmm9        \n\t"
        "mulps  %%xmm12, %%xmm8          \n\t"
        "mulps  %%xmm13, %%xmm9          \n\t"
        "addps  %%xmm8 , %%xmm4          \n\t"
        "addq   $4 , %0                  \n\t"
        "addps  %%xmm9 , %%xmm4          \n\t"

        "movups %%xmm4 , -16(%3,%0,4)    \n\t"  // 4 * y

        "subq   $4 , %1                  \n\t"
        "jnz    .L01LOOP%=               \n\t"

        :
        :
          "r" (i),      // 0
          "r" (n),      // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (alpha)   // 6
        : "cc",
          "%xmm4", "%xmm5",
          "%xmm6", "%xmm7",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

#endif

#ifndef HAVE_KERNEL_4x1

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

    BLASLONG register i  = 0;
    BLASLONG register n1 = n & -8 ;
    BLASLONG register n2 = n & 4  ;

    __asm__ __volatile__
    (
        "movss  (%2), %%xmm12            \n\t"  // x0
        "mulss  (%6), %%xmm12            \n\t"  // alpha
        "shufps $0, %%xmm12, %%xmm12     \n\t"

        "cmpq   $0, %1                   \n\t"
        "je     .L16END%=                \n\t"

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "movups   (%3,%0,4), %%xmm4      \n\t"  // 4 * y
        "movups 16(%3,%0,4), %%xmm5      \n\t"  // 4 * y
        "movups   (%4,%0,4), %%xmm8      \n\t"  // 4 * a
        "movups 16(%4,%0,4), %%xmm9      \n\t"  // 4 * a
        "mulps  %%xmm12, %%xmm8          \n\t"
        "mulps  %%xmm12, %%xmm9          \n\t"
        "addps  %%xmm4 , %%xmm8          \n\t"
        "addps  %%xmm5 , %%xmm9          \n\t"

        "addq   $8 , %0                  \n\t"
        "movups %%xmm8 , -32(%3,%0,4)    \n\t"  // 4 * y
        "movups %%xmm9 , -16(%3,%0,4)    \n\t"  // 4 * y

        "subq   $8 , %1                  \n\t"

        "jnz    .L01LOOP%=               \n\t"

        ".L16END%=:                      \n\t"

        "testq  $0x04, %5                \n\t"
        "jz     .L08LABEL%=              \n\t"

        "movups (%3,%0,4), %%xmm4        \n\t"  // 4 * y
        "movups (%4,%0,4), %%xmm8        \n\t"  // 4 * a
        "mulps  %%xmm12, %%xmm8          \n\t"
        "addps  %%xmm8 , %%xmm4          \n\t"
        "movups %%xmm4 , (%3,%0,4)       \n\t"  // 4 * y
        "addq   $4 , %0                  \n\t"
        "subq   $4 , %1                  \n\t"

        ".L08LABEL%=:                    \n\t"
        :
        :
          "r" (i),      // 0
          "r" (n1),     // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap),     // 4
          "r" (n2),     // 5
          "r" (alpha)   // 6
        : "cc",
          "%xmm4", "%xmm5",
          "%xmm6", "%xmm7",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

#endif

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG i;
    if ( inc_dest != 1 )
    {
        for ( i=0; i<n; i++ )
        {
            *dest += *src;
            src++;
            dest += inc_dest;
        }
        return;
    }

    i=0;

    __asm__ __volatile__
    (

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"

        "movups (%2,%0,4) , %%xmm12      \n\t"
        "movups (%3,%0,4) , %%xmm11      \n\t"
        "addps  %%xmm12   , %%xmm11      \n\t"
        "addq   $4 , %0                  \n\t"
        "movups %%xmm11, -16(%3,%0,4)    \n\t"

        "subq   $4 , %1                  \n\t"
        "jnz    .L01LOOP%=               \n\t"

        :
        :
          "r" (i),     // 0
          "r" (n),     // 1
          "r" (src),   // 2
          "r" (dest)   // 3
        : "cc",
          "%xmm10", "%xmm11", "%xmm12",
          "memory"
    );

}
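
/* Editor's note: two details of add_y worth noting. The strided case is
   handled in plain C before the asm is reached, and the SSE loop assumes
   n is a positive multiple of 4, which holds for every NB the driver below
   passes in. A plain-C reference for the fast path: */

static void add_y_ref(long n, const float *src, float *dest)
{
    for ( long i = 0; i < n; i++ )   /* contiguous inc_dest == 1 case */
        dest[i] += src[i];
}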

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4 = lda << 2;
    BLASLONG lda8 = lda << 3;
    FLOAT xbuffer[8],*ybuffer;

    if ( m < 1 ) return(0);
    if ( n < 1 ) return(0);

    ybuffer = buffer;

    if ( inc_x == 1 )
    {
        n1 = n >> 3 ;
        n2 = n & 7  ;
    }
    else
    {
        n1 = n >> 2 ;
        n2 = n & 3  ;

    }

    m3 = m & 3  ;
    m1 = m & -4 ;
    m2 = (m & (NBMAX-1)) - m3 ;

    y_ptr = y;

    BLASLONG NB = NBMAX;

    while ( NB == NBMAX )
    {

        m1 -= NB;
        if ( m1 < 0)
        {
            if ( m2 == 0 ) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;

        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        if ( inc_y != 1 )
            memset(ybuffer,0,NB*4);   /* 4 == sizeof(FLOAT) for single precision */
        else
            ybuffer = y_ptr;

        if ( inc_x == 1 )
        {

            for( i = 0; i < n1 ; i++)
            {
                sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
                ap[0] += lda8;
                ap[1] += lda8;
                ap[2] += lda8;
                ap[3] += lda8;
                a_ptr += lda8;
                x_ptr += 8;
            }

            if ( n2 & 4 )
            {
                sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
                /* ap[2]/ap[3] are not advanced: the remaining 4x2 and 4x1
                   steps below use only ap[0], ap[1] and a_ptr */
                ap[0] += lda4;
                ap[1] += lda4;
                a_ptr += lda4;
                x_ptr += 4;
            }

            if ( n2 & 2 )
            {
                sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
                a_ptr += lda*2;
                x_ptr += 2;
            }

            if ( n2 & 1 )
            {
                sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
                a_ptr += lda;
                x_ptr += 1;

            }

        }
        else
        {

            for( i = 0; i < n1 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[1] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[3] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
            }

            for( i = 0; i < n2 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
                a_ptr += lda;

            }

        }

        a += NB;
        if ( inc_y != 1 )
        {
            add_y(NB,ybuffer,y_ptr,inc_y);
            y_ptr += NB * inc_y;
        }
        else
            y_ptr += NB ;

    }

    if ( m3 == 0 ) return(0);

    if ( m3 == 3 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        FLOAT temp2 = 0.0;
        if ( lda == 3 && inc_x ==1 )
        {

            for( i = 0; i < ( n & -4 ); i+=4 )
            {

                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3]  * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4]  * x_ptr[1];
                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5]  * x_ptr[1];

                temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
                temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
                temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

                a_ptr += 12;
                x_ptr += 4;
            }

            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += 3;
                x_ptr ++;
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;

            }

        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp2;
        return(0);
    }

    if ( m3 == 2 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        if ( lda == 2 && inc_x ==1 )
        {

            for( i = 0; i < (n & -4) ; i+=4 )
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
                temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                a_ptr += 8;
                x_ptr += 4;

            }

            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += 2;
                x_ptr ++;
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;

            }

        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        return(0);
    }

    if ( m3 == 1 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp = 0.0;
        if ( lda == 1 && inc_x ==1 )
        {

            for( i = 0; i < (n & -4); i+=4 )
            {
                temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];

            }

            for( ; i < n; i++ )
            {
                temp += a_ptr[i] * x_ptr[i];
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp += a_ptr[0] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
        y_ptr[0] += alpha * temp;
        return(0);
    }

    return(0);
}
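
/* Editor's note: the row blocking in CNAME above splits m into full panels
   of NBMAX rows, at most one partial panel of m2 rows (a multiple of 4),
   and an m3 = m % 4 scalar tail. A self-contained sketch of just that
   partitioning logic, with the same loop shape: */

#include <stdio.h>

#define NBMAX 4096

int main(void)
{
    long m  = 10007;                     /* example row count */
    long m3 = m & 3;                     /* scalar tail rows */
    long m1 = m & -4;                    /* rows covered by panels */
    long m2 = (m & (NBMAX - 1)) - m3;    /* size of the partial panel */
    long NB = NBMAX;

    while ( NB == NBMAX )
    {
        m1 -= NB;
        if ( m1 < 0 )
        {
            if ( m2 == 0 ) break;        /* no partial panel: only the m3 tail remains */
            NB = m2;                     /* last, partial panel */
        }
        printf("panel of %ld rows\n", NB);
    }
    printf("scalar tail of %ld rows\n", m3);   /* panels: 4096, 4096, 1812; tail 3 */
    return 0;
}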
@ -1,218 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_n_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_n_microk_haswell.c"
#else
#include "sgemv_n_microk_sandy.c"
#endif

static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG i;
    for ( i=0; i<n; i++ )
    {
        *dest = *src;
        dest++;
        src += inc_src;
    }
}

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG i;
    for ( i=0; i<n; i++ )
    {
        *dest += *src;
        src++;
        dest += inc_dest;
    }
}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG register m2;
    BLASLONG register n2;
    FLOAT *xbuffer,*ybuffer;
    xbuffer = buffer;
    ybuffer = xbuffer + 2048 + 256;

    n1 = n / 512 ;
    n2 = n % 512 ;

    m1 = m / 64;
    m2 = m % 64;

    y_ptr = y;
    x_ptr = x;

    for (j=0; j<n1; j++)
    {

        if ( inc_x == 1 )
            xbuffer = x_ptr;
        else
            copy_x(512,x_ptr,xbuffer,inc_x);

        a_ptr = a + j * 512 * lda;
        y_ptr = y;

        for(i = 0; i<m1; i++ )
        {
            sgemv_kernel_64(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(64,ybuffer,y_ptr,inc_y);
            y_ptr += 64 * inc_y;
            a_ptr += 64;

        }

        if ( m2 & 32 )
        {
            sgemv_kernel_32(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(32,ybuffer,y_ptr,inc_y);
            y_ptr += 32 * inc_y;
            a_ptr += 32;

        }

        if ( m2 & 16 )
        {
            sgemv_kernel_16(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(16,ybuffer,y_ptr,inc_y);
            y_ptr += 16 * inc_y;
            a_ptr += 16;
        }
        if ( m2 & 8 )
        {
            sgemv_kernel_8(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(8,ybuffer,y_ptr,inc_y);
            y_ptr += 8 * inc_y;
            a_ptr += 8;
        }
        if ( m2 & 4 )
        {
            sgemv_kernel_4(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(4,ybuffer,y_ptr,inc_y);
            y_ptr += 4 * inc_y;
            a_ptr += 4;
        }
        if ( m2 & 2 )
        {
            sgemv_kernel_2(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(2,ybuffer,y_ptr,inc_y);
            y_ptr += 2 * inc_y;
            a_ptr += 2;
        }
        if ( m2 & 1 )
        {
            sgemv_kernel_1(512,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(1,ybuffer,y_ptr,inc_y);
        }
        x_ptr += 512 * inc_x;

    }

    if ( n2 > 0 )
    {

        if ( inc_x == 1 )
            xbuffer = x_ptr;
        else
            copy_x(n2,x_ptr,xbuffer,inc_x);

        a_ptr = a + n1 * 512 * lda;
        y_ptr = y;

        for(i = 0; i<m1; i++ )
        {
            sgemv_kernel_64(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(64,ybuffer,y_ptr,inc_y);
            y_ptr += 64 * inc_y;
            a_ptr += 64;

        }

        if ( m2 & 32 )
        {
            sgemv_kernel_32(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(32,ybuffer,y_ptr,inc_y);
            y_ptr += 32 * inc_y;
            a_ptr += 32;

        }
        if ( m2 & 16 )
        {
            sgemv_kernel_16(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(16,ybuffer,y_ptr,inc_y);
            y_ptr += 16 * inc_y;
            a_ptr += 16;
        }
        if ( m2 & 8 )
        {
            sgemv_kernel_8(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(8,ybuffer,y_ptr,inc_y);
            y_ptr += 8 * inc_y;
            a_ptr += 8;
        }
        if ( m2 & 4 )
        {
            sgemv_kernel_4(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(4,ybuffer,y_ptr,inc_y);
            y_ptr += 4 * inc_y;
            a_ptr += 4;
        }
        if ( m2 & 2 )
        {
            sgemv_kernel_2(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(2,ybuffer,y_ptr,inc_y);
            y_ptr += 2 * inc_y;
            a_ptr += 2;
        }
        if ( m2 & 1 )
        {
            sgemv_kernel_1(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
            add_y(1,ybuffer,y_ptr,inc_y);
        }

    }
    return(0);
}
@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{

    BLASLONG register i = 0;

    __asm__ __volatile__
    (
        "vbroadcastss   (%2), %%xmm12    \n\t"  // x0
        "vbroadcastss  4(%2), %%xmm13    \n\t"  // x1
        "vbroadcastss  8(%2), %%xmm14    \n\t"  // x2
        "vbroadcastss 12(%2), %%xmm15    \n\t"  // x3
        "vbroadcastss 16(%2), %%xmm0     \n\t"  // x4
        "vbroadcastss 20(%2), %%xmm1     \n\t"  // x5
        "vbroadcastss 24(%2), %%xmm2     \n\t"  // x6
        "vbroadcastss 28(%2), %%xmm3     \n\t"  // x7

        "vbroadcastss   (%9), %%xmm8     \n\t"  // alpha

        "testq  $0x04, %1                \n\t"
        "jz     .L08LABEL%=              \n\t"

        "vxorps %%xmm4, %%xmm4 , %%xmm4  \n\t"
        "vxorps %%xmm5, %%xmm5 , %%xmm5  \n\t"

        "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
        "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
        "addq   $4 , %0                  \n\t"

        "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
        "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
        "addq   $4 , %8                  \n\t"

        "vaddps   %%xmm5 , %%xmm4, %%xmm4            \n\t"
        "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
        "subq   $4 , %1                  \n\t"
        "vmovups  %%xmm6, -16(%3,%0,4)   \n\t"  // 4 * y

        ".L08LABEL%=:                    \n\t"

        "testq  $0x08, %1                \n\t"
        "jz     .L16LABEL%=              \n\t"

        "vxorps %%xmm4, %%xmm4 , %%xmm4  \n\t"
        "vxorps %%xmm5, %%xmm5 , %%xmm5  \n\t"

        "vfmaddps %%xmm4,   (%4,%0,4), %%xmm12, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%5,%0,4), %%xmm13, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%6,%0,4), %%xmm14, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%7,%0,4), %%xmm15, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"

        "vfmaddps %%xmm4,   (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
        "vfmaddps %%xmm4,   (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"

        "vfmaddps   (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
        "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
        "vmovups  %%xmm4,   (%3,%0,4)    \n\t"  // 4 * y
        "vmovups  %%xmm5, 16(%3,%0,4)    \n\t"  // 4 * y

        "addq   $8 , %0                  \n\t"
        "addq   $8 , %8                  \n\t"
        "subq   $8 , %1                  \n\t"

        ".L16LABEL%=:                    \n\t"

        "cmpq   $0, %1                   \n\t"
        "je     .L16END%=                \n\t"

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"

        "vxorps %%xmm4, %%xmm4 , %%xmm4  \n\t"
        "vxorps %%xmm5, %%xmm5 , %%xmm5  \n\t"
        "vxorps %%xmm6, %%xmm6 , %%xmm6  \n\t"
        "vxorps %%xmm7, %%xmm7 , %%xmm7  \n\t"

        "prefetcht0 192(%4,%0,4)         \n\t"
        "vfmaddps %%xmm4,   (%4,%0,4), %%xmm12, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
        "prefetcht0 192(%5,%0,4)         \n\t"
        "vfmaddps %%xmm4,   (%5,%0,4), %%xmm13, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
        "prefetcht0 192(%6,%0,4)         \n\t"
        "vfmaddps %%xmm4,   (%6,%0,4), %%xmm14, %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
        "prefetcht0 192(%7,%0,4)         \n\t"
        "vfmaddps %%xmm4,   (%7,%0,4), %%xmm15, %%xmm4 \n\t"
        ".align 2                        \n\t"
        "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"

        "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"

        "prefetcht0 192(%4,%8,4)         \n\t"
        "vfmaddps %%xmm4,   (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
        "prefetcht0 192(%5,%8,4)         \n\t"
        "vfmaddps %%xmm4,   (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
        "prefetcht0 192(%6,%8,4)         \n\t"
        "vfmaddps %%xmm4,   (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
        "prefetcht0 192(%7,%8,4)         \n\t"
        "vfmaddps %%xmm4,   (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
        "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"

        "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
        "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
        "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"

        "vfmaddps   (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
        "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
        "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
        "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"

        "addq   $16, %0                  \n\t"
        "vmovups  %%xmm4,-64(%3,%0,4)    \n\t"  // 4 * y
        "vmovups  %%xmm5,-48(%3,%0,4)    \n\t"  // 4 * y
        "addq   $16, %8                  \n\t"
        "vmovups  %%xmm6,-32(%3,%0,4)    \n\t"  // 4 * y
        "vmovups  %%xmm7,-16(%3,%0,4)    \n\t"  // 4 * y

        "subq   $16, %1                  \n\t"
        "jnz    .L01LOOP%=               \n\t"

        ".L16END%=:                      \n\t"

        :
        :
          "r" (i),      // 0
          "r" (n),      // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3]),  // 7
          "r" (lda4),   // 8
          "r" (alpha)   // 9
        : "cc",
          "%xmm0", "%xmm1",
          "%xmm2", "%xmm3",
          "%xmm4", "%xmm5",
          "%xmm6", "%xmm7",
          "%xmm8", "%xmm9",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

    BLASLONG register i = 0;

    __asm__ __volatile__
    (
        "vbroadcastss   (%2), %%xmm12    \n\t"  // x0
        "vbroadcastss  4(%2), %%xmm13    \n\t"  // x1
        "vbroadcastss  8(%2), %%xmm14    \n\t"  // x2
        "vbroadcastss 12(%2), %%xmm15    \n\t"  // x3

        "vbroadcastss   (%8), %%xmm8     \n\t"  // alpha

        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "vxorps %%xmm4, %%xmm4 , %%xmm4  \n\t"
        "vxorps %%xmm5, %%xmm5 , %%xmm5  \n\t"

        "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
        "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
        "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"

        "vaddps   %%xmm4, %%xmm5, %%xmm4 \n\t"

        "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm6 \n\t"
        "vmovups  %%xmm6, (%3,%0,4)      \n\t"  // 4 * y

        "addq   $4 , %0                  \n\t"
        "subq   $4 , %1                  \n\t"
        "jnz    .L01LOOP%=               \n\t"

        :
        :
          "r" (i),      // 0
          "r" (n),      // 1
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3]),  // 7
          "r" (alpha)   // 8
        : "cc",
          "%xmm4", "%xmm5",
          "%xmm6", "%xmm7",
          "%xmm8", "%xmm9",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}
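
/* Editor's note: unlike the Haswell kernels, which use the three-operand
   FMA3 form (vfmadd231ps), these Bulldozer/Piledriver kernels use AMD's
   four-operand FMA4 encoding. In the AT&T operand order written above,
   the first operand is the addend, the middle two are multiplied, and the
   last is the destination, so each accumulation step is
   acc = column * x_broadcast + acc, and the final update per 4-wide slice
   is y = acc * alpha + y (xmm8 holds the broadcast alpha). Scalar sketch
   of that fused update, for reference: */

static inline float fma4_update(float acc, float alpha, float y)
{
    return acc * alpha + y;   /* one fused multiply-add per lane */
}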
@ -1,451 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{

    float *pre = a + lda*3;

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%ymm1\n\t"  // alpha -> ymm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx
        "movq          %6, %%r8  \n\t"  // address for prefetch
        "prefetcht0    (%%r8)    \n\t"  // Prefetch
        "prefetcht0  64(%%r8)    \n\t"  // Prefetch

        "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t"  // set to zero
        "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t"  // set to zero
        "vxorps %%ymm10, %%ymm10, %%ymm10\n\t"  // set to zero
        "vxorps %%ymm11, %%ymm11, %%ymm11\n\t"  // set to zero
        "vxorps %%ymm12, %%ymm12, %%ymm12\n\t"  // set to zero
        "vxorps %%ymm13, %%ymm13, %%ymm13\n\t"  // set to zero
        "vxorps %%ymm14, %%ymm14, %%ymm14\n\t"  // set to zero
        "vxorps %%ymm15, %%ymm15, %%ymm15\n\t"  // set to zero
        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%ymm0    \n\t"  // load values of c
        "nop                             \n\t"
        "leaq (%%r8 , %%rcx, 4), %%r8    \n\t"  // add lda to pointer for prefetch

        "prefetcht0    (%%r8)            \n\t"  // Prefetch
        "vfmaddps %%ymm8 ,  0*4(%%rsi), %%ymm0, %%ymm8 \n\t"  // multiply a and c and add to temp
        "prefetcht0  64(%%r8)            \n\t"  // Prefetch
        "vfmaddps %%ymm9 ,  8*4(%%rsi), %%ymm0, %%ymm9 \n\t"  // multiply a and c and add to temp
        "prefetcht0 128(%%r8)            \n\t"  // Prefetch
        "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t"  // multiply a and c and add to temp
        "prefetcht0 192(%%r8)            \n\t"  // Prefetch
        "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t"  // multiply a and c and add to temp

        "addq   $4 , %%rdi               \n\t"  // increment pointer of c
        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%ymm8 , %%ymm1, %%ymm8  \n\t"  // scale by alpha
        "vmulps %%ymm9 , %%ymm1, %%ymm9  \n\t"  // scale by alpha
        "vmulps %%ymm10, %%ymm1, %%ymm10 \n\t"  // scale by alpha
        "vmulps %%ymm11, %%ymm1, %%ymm11 \n\t"  // scale by alpha
        "vmulps %%ymm12, %%ymm1, %%ymm12 \n\t"  // scale by alpha
        "vmulps %%ymm13, %%ymm1, %%ymm13 \n\t"  // scale by alpha
        "vmulps %%ymm14, %%ymm1, %%ymm14 \n\t"  // scale by alpha
        "vmulps %%ymm15, %%ymm1, %%ymm15 \n\t"  // scale by alpha

        "vmovups %%ymm8 ,     (%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm9 ,  8*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm10, 16*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm11, 24*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm12, 32*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm13, 40*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm14, 48*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%ymm15, 56*4(%%rdx)    \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y),      // 5
          "m" (pre)     // 6
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{

    float *pre = a + lda*3;

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%xmm1\n\t"  // alpha -> xmm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx
        "movq          %6, %%r8  \n\t"  // address for prefetch
        "prefetcht0    (%%r8)    \n\t"  // Prefetch
        "prefetcht0  64(%%r8)    \n\t"  // Prefetch

        "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t"  // set to zero
        "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t"  // set to zero
        "vxorps %%xmm10, %%xmm10, %%xmm10\n\t"  // set to zero
        "vxorps %%xmm11, %%xmm11, %%xmm11\n\t"  // set to zero
        "vxorps %%xmm12, %%xmm12, %%xmm12\n\t"  // set to zero
        "vxorps %%xmm13, %%xmm13, %%xmm13\n\t"  // set to zero
        "vxorps %%xmm14, %%xmm14, %%xmm14\n\t"  // set to zero
        "vxorps %%xmm15, %%xmm15, %%xmm15\n\t"  // set to zero
        ".align 16                       \n\t"
        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%xmm0    \n\t"  // load values of c
        "nop                             \n\t"
        "leaq (%%r8 , %%rcx, 4), %%r8    \n\t"  // add lda to pointer for prefetch

        "prefetcht0    (%%r8)            \n\t"  // Prefetch
        "vfmaddps %%xmm8 ,  0*4(%%rsi), %%xmm0, %%xmm8 \n\t"  // multiply a and c and add to temp
        "prefetcht0  64(%%r8)            \n\t"  // Prefetch
        "vfmaddps %%xmm9 ,  4*4(%%rsi), %%xmm0, %%xmm9 \n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm10,  8*4(%%rsi), %%xmm0, %%xmm10\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t"  // multiply a and c and add to temp
        "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t"  // multiply a and c and add to temp

        "addq   $4 , %%rdi               \n\t"  // increment pointer of c
        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%xmm8 , %%xmm1, %%xmm8  \n\t"  // scale by alpha
        "vmulps %%xmm9 , %%xmm1, %%xmm9  \n\t"  // scale by alpha
        "vmulps %%xmm10, %%xmm1, %%xmm10 \n\t"  // scale by alpha
        "vmulps %%xmm11, %%xmm1, %%xmm11 \n\t"  // scale by alpha
        "vmulps %%xmm12, %%xmm1, %%xmm12 \n\t"  // scale by alpha
        "vmulps %%xmm13, %%xmm1, %%xmm13 \n\t"  // scale by alpha
        "vmulps %%xmm14, %%xmm1, %%xmm14 \n\t"  // scale by alpha
        "vmulps %%xmm15, %%xmm1, %%xmm15 \n\t"  // scale by alpha

        "vmovups %%xmm8 ,     (%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm9 ,  4*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm10,  8*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm11, 12*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm12, 16*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm13, 20*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm14, 24*4(%%rdx)    \n\t"  // store temp -> y
        "vmovups %%xmm15, 28*4(%%rdx)    \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y),      // 5
          "m" (pre)     // 6
    );

}

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

    float *pre = a + lda*3;

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%ymm1\n\t"  // alpha -> ymm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx
        "movq          %6, %%r8  \n\t"  // address for prefetch
        "prefetcht0    (%%r8)    \n\t"  // Prefetch

        "vxorps %%ymm12, %%ymm12, %%ymm12\n\t"  // set to zero
        "vxorps %%ymm13, %%ymm13, %%ymm13\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%ymm0    \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "leaq (%%r8 , %%rcx, 4), %%r8    \n\t"  // add lda to pointer for prefetch
        "prefetcht0    (%%r8)            \n\t"  // Prefetch

        "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t"  // multiply a and c and add to temp
        "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%ymm12, %%ymm1, %%ymm12 \n\t"  // scale by alpha
        "vmulps %%ymm13, %%ymm1, %%ymm13 \n\t"  // scale by alpha

        "vmovups %%ymm12,    (%%rdx)     \n\t"  // store temp -> y
        "vmovups %%ymm13, 8*4(%%rdx)     \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y),      // 5
          "m" (pre)     // 6
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%ymm1\n\t"  // alpha -> ymm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx

        "vxorps %%ymm12, %%ymm12, %%ymm12\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%ymm0    \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%ymm12, %%ymm1, %%ymm12 \n\t"  // scale by alpha

        "vmovups %%ymm12, (%%rdx)        \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y)       // 5
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vbroadcastss  %1, %%xmm1\n\t"  // alpha -> xmm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx

        "vxorps %%xmm12, %%xmm12, %%xmm12\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vbroadcastss (%%rdi), %%xmm0    \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulps %%xmm12, %%xmm1, %%xmm12 \n\t"  // scale by alpha

        "vmovups %%xmm12, (%%rdx)        \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y)       // 5
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vmovss        %1, %%xmm1\n\t"  // alpha -> xmm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx

        "vxorps %%xmm12, %%xmm12, %%xmm12\n\t"  // set to zero
        "vxorps %%xmm13, %%xmm13, %%xmm13\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vmovss (%%rdi), %%xmm0          \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"  // multiply a and c and add to temp
        "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t"  // scale by alpha
        "vmulss %%xmm13, %%xmm1, %%xmm13 \n\t"  // scale by alpha

        "vmovss %%xmm12,  (%%rdx)        \n\t"  // store temp -> y
        "vmovss %%xmm13, 4(%%rdx)        \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y)       // 5
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}

static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{

    __asm__ __volatile__
    (
        "movq          %0, %%rax \n\t"  // n -> rax
        "vmovss        %1, %%xmm1\n\t"  // alpha -> xmm1
        "movq          %2, %%rsi \n\t"  // adress of a -> rsi
        "movq          %3, %%rcx \n\t"  // value of lda > rcx
        "movq          %4, %%rdi \n\t"  // adress of x -> rdi
        "movq          %5, %%rdx \n\t"  // adress of y -> rdx

        "vxorps %%xmm12, %%xmm12, %%xmm12\n\t"  // set to zero

        ".L01LOOP%=:                     \n\t"
        "vmovss (%%rdi), %%xmm0          \n\t"  // load values of c
        "addq   $4 , %%rdi               \n\t"  // increment pointer of c

        "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"  // multiply a and c and add to temp

        "leaq (%%rsi, %%rcx, 4), %%rsi   \n\t"  // add lda to pointer of a

        "dec    %%rax                    \n\t"  // n = n -1
        "jnz    .L01LOOP%=               \n\t"

        "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t"  // scale by alpha

        "vmovss %%xmm12, (%%rdx)         \n\t"  // store temp -> y

        :
        :
          "m" (n),      // 0
          "m" (alpha),  // 1
          "m" (a),      // 2
          "m" (lda),    // 3
          "m" (x),      // 4
          "m" (y)       // 5
        : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
          "%xmm0", "%xmm1",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "memory"
    );

}
@ -0,0 +1,299 @@
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG register i = 0;
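	// y[i..] += alpha * sum of eight columns: ap[0..3] indexed by i (x0..x3)
	// plus ap[0..3] indexed by the running lda4 offset (x4..x7); 4- and 8-row
	// tails are peeled off before the unrolled 16-rows-per-pass FMA loop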
	__asm__ __volatile__
	(
	"vzeroupper                      \n\t"
	"vbroadcastss    (%2), %%ymm12   \n\t"  // x0
	"vbroadcastss   4(%2), %%ymm13   \n\t"  // x1
	"vbroadcastss   8(%2), %%ymm14   \n\t"  // x2
	"vbroadcastss  12(%2), %%ymm15   \n\t"  // x3
	"vbroadcastss  16(%2), %%ymm0    \n\t"  // x4
	"vbroadcastss  20(%2), %%ymm1    \n\t"  // x5
	"vbroadcastss  24(%2), %%ymm2    \n\t"  // x6
	"vbroadcastss  28(%2), %%ymm3    \n\t"  // x7

	"vbroadcastss    (%9), %%ymm6    \n\t"  // alpha

	"testq  $0x04, %1                \n\t"
	"jz     .L08LABEL%=              \n\t"

	"vmovups (%3,%0,4), %%xmm7       \n\t"  // 4 * y
	"vxorps  %%xmm4 , %%xmm4, %%xmm4 \n\t"
	"vxorps  %%xmm5 , %%xmm5, %%xmm5 \n\t"

	"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
	"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"

	"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
	"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
	"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
	"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"

	"vaddps  %%xmm4 , %%xmm5 , %%xmm5 \n\t"
	"vmulps  %%xmm6 , %%xmm5 , %%xmm5 \n\t"
	"vaddps  %%xmm7 , %%xmm5 , %%xmm5 \n\t"

	"vmovups %%xmm5, (%3,%0,4)       \n\t"  // 4 * y

	"addq   $4 , %8                  \n\t"
	"addq   $4 , %0                  \n\t"
	"subq   $4 , %1                  \n\t"

	".L08LABEL%=:                    \n\t"

	"testq  $0x08, %1                \n\t"
	"jz     .L16LABEL%=              \n\t"

	"vmovups (%3,%0,4), %%ymm7       \n\t"  // 8 * y
	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"

	"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
	"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"

	"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
	"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
	"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
	"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"

	"vaddps  %%ymm4 , %%ymm5 , %%ymm5 \n\t"
	"vmulps  %%ymm6 , %%ymm5 , %%ymm5 \n\t"
	"vaddps  %%ymm7 , %%ymm5 , %%ymm5 \n\t"

	"vmovups %%ymm5, (%3,%0,4)       \n\t"  // 8 * y

	"addq   $8 , %8                  \n\t"
	"addq   $8 , %0                  \n\t"
	"subq   $8 , %1                  \n\t"

	".L16LABEL%=:                    \n\t"

	"cmpq   $0, %1                   \n\t"
	"je     .L16END%=                \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"

	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vmovups   (%3,%0,4), %%ymm8     \n\t"  // 8 * y
	"vmovups 32(%3,%0,4), %%ymm9     \n\t"  // 8 * y

	"vfmadd231ps   (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
	"vfmadd231ps   (%5,%0,4), %%ymm13, %%ymm4 \n\t"
	"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"vfmadd231ps   (%6,%0,4), %%ymm14, %%ymm4 \n\t"
	"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
	"vfmadd231ps   (%7,%0,4), %%ymm15, %%ymm4 \n\t"
	"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"

	"vfmadd231ps   (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
	"addq   $16, %0                  \n\t"
	"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
	"vfmadd231ps   (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
	"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
	"vfmadd231ps   (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
	"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
	"vfmadd231ps   (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
	"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"

	"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
	"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"

	"addq   $16, %8                  \n\t"
	"vmovups %%ymm8,-64(%3,%0,4)     \n\t"  // 8 * y
	"subq   $16, %1                  \n\t"
	"vmovups %%ymm9,-32(%3,%0,4)     \n\t"  // 8 * y

	"jnz    .L01LOOP%=               \n\t"

	".L16END%=:                      \n\t"
	"vzeroupper                      \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (lda4),   // 8
	"r" (alpha)   // 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;
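	// four-column variant: y[i..] += alpha * sum_{k<4} x[k] * ap[k][i..],
	// same 4/8-row tail peeling before the 16-rows-per-pass FMA loop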
	__asm__ __volatile__
	(
	"vzeroupper                      \n\t"
	"vbroadcastss    (%2), %%ymm12   \n\t"  // x0
	"vbroadcastss   4(%2), %%ymm13   \n\t"  // x1
	"vbroadcastss   8(%2), %%ymm14   \n\t"  // x2
	"vbroadcastss  12(%2), %%ymm15   \n\t"  // x3

	"vbroadcastss    (%8), %%ymm6    \n\t"  // alpha

	"testq  $0x04, %1                \n\t"
	"jz     .L08LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%xmm7       \n\t"  // 4 * y

	"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
	"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"

	"vaddps  %%xmm4 , %%xmm5 , %%xmm5 \n\t"
	"vmulps  %%xmm6 , %%xmm5 , %%xmm5 \n\t"
	"vaddps  %%xmm7 , %%xmm5 , %%xmm5 \n\t"

	"vmovups %%xmm5, (%3,%0,4)       \n\t"  // 4 * y

	"addq   $4 , %0                  \n\t"
	"subq   $4 , %1                  \n\t"

	".L08LABEL%=:                    \n\t"

	"testq  $0x08, %1                \n\t"
	"jz     .L16LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%ymm7       \n\t"  // 8 * y

	"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
	"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"

	"vaddps  %%ymm4 , %%ymm5 , %%ymm5 \n\t"
	"vmulps  %%ymm6 , %%ymm5 , %%ymm5 \n\t"
	"vaddps  %%ymm7 , %%ymm5 , %%ymm5 \n\t"

	"vmovups %%ymm5, (%3,%0,4)       \n\t"  // 8 * y

	"addq   $8 , %0                  \n\t"
	"subq   $8 , %1                  \n\t"

	".L16LABEL%=:                    \n\t"

	"cmpq   $0, %1                   \n\t"
	"je     .L16END%=                \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vxorps  %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vmovups   (%3,%0,4), %%ymm8     \n\t"  // 8 * y
	"vmovups 32(%3,%0,4), %%ymm9     \n\t"  // 8 * y

	"vfmadd231ps   (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
	"vfmadd231ps   (%5,%0,4), %%ymm13, %%ymm4 \n\t"
	"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"vfmadd231ps   (%6,%0,4), %%ymm14, %%ymm4 \n\t"
	"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
	"vfmadd231ps   (%7,%0,4), %%ymm15, %%ymm4 \n\t"
	"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"

	"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
	"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"

	"vmovups %%ymm8,   (%3,%0,4)     \n\t"  // 8 * y
	"vmovups %%ymm9, 32(%3,%0,4)     \n\t"  // 8 * y

	"addq   $16, %0                  \n\t"
	"subq   $16, %1                  \n\t"
	"jnz    .L01LOOP%=               \n\t"

	".L16END%=:                      \n\t"
	"vzeroupper                      \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (alpha)   // 8
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

@ -1,461 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project. All rights reserved.
[BSD-3-Clause license text identical to the header above]
*****************************************************************************/

static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
	float *pre = a + lda*2;

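	// old kernel: y[0:64] = alpha * A[0:64,0:n] * x, one column per iteration,
	// accumulated in ymm8..ymm15 with FMA; the prefetch pointer (pre) runs
	// two columns ahead of the column being processed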
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx
	"movq    %6, %%r8\n\t"   // address for prefetch
	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps  %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
	"vxorps  %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
	"vxorps  %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
	"vxorps  %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
	"vxorps  %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
	"vxorps  %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
	"vxorps  %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x
	"leaq    (%%r8 , %%rcx, 4), %%r8 \n\t"  // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t"  // Prefetch
	"vfmadd231ps  0*4(%%rsi), %%ymm0, %%ymm8 \n\t"  // multiply a and x and add to temp
	"vfmadd231ps  8*4(%%rsi), %%ymm0, %%ymm9 \n\t"  // multiply a and x and add to temp
	"prefetcht0 64(%%r8)\n\t"  // Prefetch
	"vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t"  // multiply a and x and add to temp
	"vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t"  // multiply a and x and add to temp
	"prefetcht0 128(%%r8)\n\t"  // Prefetch
	"vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t"  // multiply a and x and add to temp
	"vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t"  // multiply a and x and add to temp
	"prefetcht0 192(%%r8)\n\t"  // Prefetch
	"vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t"  // multiply a and x and add to temp
	"vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t"  // multiply a and x and add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmulps  %%ymm9 , %%ymm1, %%ymm9 \n\t"  // scale by alpha
	"vmulps  %%ymm10, %%ymm1, %%ymm10\n\t"  // scale by alpha
	"vmulps  %%ymm11, %%ymm1, %%ymm11\n\t"  // scale by alpha
	"vmulps  %%ymm12, %%ymm1, %%ymm12\n\t"  // scale by alpha
	"vmulps  %%ymm13, %%ymm1, %%ymm13\n\t"  // scale by alpha
	"vmulps  %%ymm14, %%ymm1, %%ymm14\n\t"  // scale by alpha
	"vmulps  %%ymm15, %%ymm1, %%ymm15\n\t"  // scale by alpha

	"vmovups %%ymm8 ,     (%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm9 ,  8*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm10, 16*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm11, 24*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm12, 32*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm13, 40*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm14, 48*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm15, 56*4(%%rdx)    \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y),      // 5
	"m" (pre)     // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
{
	float *pre = a + lda*3;

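	// 32-row variant without FMA: separate vmulps/vaddps into ymm8..ymm11;
	// the prefetch pointer runs three columns ahead (a + lda*3)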
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx
	"movq    %6, %%r8\n\t"   // address for prefetch
	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps  %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
	"vxorps  %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
	"vxorps  %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x
	"nop                             \n\t"
	"leaq    (%%r8 , %%rcx, 4), %%r8 \n\t"  // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vmulps   0*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vmulps   8*4(%%rsi), %%ymm0, %%ymm5 \n\t"  // multiply a and x
	"vmulps  16*4(%%rsi), %%ymm0, %%ymm6 \n\t"  // multiply a and x
	"vmulps  24*4(%%rsi), %%ymm0, %%ymm7 \n\t"  // multiply a and x

	"vaddps  %%ymm8 , %%ymm4, %%ymm8 \n\t"  // add to temp
	"vaddps  %%ymm9 , %%ymm5, %%ymm9 \n\t"  // add to temp
	"vaddps  %%ymm10, %%ymm6, %%ymm10\n\t"  // add to temp
	"vaddps  %%ymm11, %%ymm7, %%ymm11\n\t"  // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmulps  %%ymm9 , %%ymm1, %%ymm9 \n\t"  // scale by alpha
	"vmulps  %%ymm10, %%ymm1, %%ymm10\n\t"  // scale by alpha
	"vmulps  %%ymm11, %%ymm1, %%ymm11\n\t"  // scale by alpha

	"vmovups %%ymm8 ,     (%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm9 ,  8*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm10, 16*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm11, 24*4(%%rdx)    \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y),      // 5
	"m" (pre)     // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "memory"
	);

}

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{
	float *pre = a + lda*3;

	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx
	"movq    %6, %%r8\n\t"   // address for prefetch
	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps  %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x
	"nop                             \n\t"
	"leaq    (%%r8 , %%rcx, 4), %%r8 \n\t"  // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t"  // Prefetch

	"vmulps   0*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vmulps   8*4(%%rsi), %%ymm0, %%ymm5 \n\t"  // multiply a and x

	"vaddps  %%ymm8 , %%ymm4, %%ymm8 \n\t"  // add to temp
	"vaddps  %%ymm9 , %%ymm5, %%ymm9 \n\t"  // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmulps  %%ymm9 , %%ymm1, %%ymm9 \n\t"  // scale by alpha

	"vmovups %%ymm8 ,    (%%rdx)     \n\t"  // store temp -> y
	"vmovups %%ymm9 , 8*4(%%rdx)     \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y),      // 5
	"m" (pre)     // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "memory"
	);

}

static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x

	"vmulps  0*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vaddps  %%ymm8 , %%ymm4, %%ymm8 \n\t"     // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmovups %%ymm8 , (%%rdx)        \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y)       // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "memory"
	);

}

static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%xmm1\n\t"  // alpha -> xmm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx

	"vxorps  %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero

	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%xmm0    \n\t"  // load value of x

	"vmulps  0*4(%%rsi), %%xmm0, %%xmm4 \n\t"  // multiply a and x
	"vaddps  %%xmm12, %%xmm4, %%xmm12 \n\t"    // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%xmm12, %%xmm1, %%xmm12\n\t"  // scale by alpha

	"vmovups %%xmm12, (%%rdx)        \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y)       // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vmovss  %1, %%xmm1\n\t" // alpha -> xmm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx

	"vxorps  %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
	"vxorps  %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero

	".L01LOOP%=:                     \n\t"
	"vmovss  (%%rdi), %%xmm0         \n\t"  // load value of x

	"vmulps  0*4(%%rsi), %%xmm0, %%xmm4 \n\t"  // multiply a and x
	"vmulps  1*4(%%rsi), %%xmm0, %%xmm5 \n\t"  // multiply a and x

	"vaddps  %%xmm12, %%xmm4, %%xmm12 \n\t"  // add to temp
	"vaddps  %%xmm13, %%xmm5, %%xmm13 \n\t"  // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulss  %%xmm12, %%xmm1, %%xmm12\n\t"  // scale by alpha
	"vmulss  %%xmm13, %%xmm1, %%xmm13\n\t"  // scale by alpha

	"vmovss  %%xmm12,  (%%rdx)       \n\t"  // store temp -> y
	"vmovss  %%xmm13, 4(%%rdx)       \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y)       // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vmovss  %1, %%xmm1\n\t" // alpha -> xmm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx

	"vxorps  %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero

	".L01LOOP%=:                     \n\t"
	"vmovss  (%%rdi), %%xmm0         \n\t"  // load value of x
	"addq    $4 , %%rdi              \n\t"  // increment pointer of x

	"vmulss  0*4(%%rsi), %%xmm0, %%xmm4 \n\t"  // multiply a and x
	"vaddss  %%xmm12, %%xmm4, %%xmm12 \n\t"    // add to temp

	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulss  %%xmm12, %%xmm1, %%xmm12\n\t"  // scale by alpha

	"vmovss  %%xmm12, (%%rdx)        \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y)       // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

@ -0,0 +1,204 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project. All rights reserved.
[BSD-3-Clause license text identical to the header above]
*****************************************************************************/

#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG register i = 0;
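	// SSE variant of the 4x8 kernel: x0..x7 and alpha are splat with
	// movss+shufps; four rows of y per iteration, n assumed a multiple of 4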
	__asm__ __volatile__
	(
	"movss    (%2), %%xmm12          \n\t"  // x0
	"movss   4(%2), %%xmm13          \n\t"  // x1
	"movss   8(%2), %%xmm14          \n\t"  // x2
	"movss  12(%2), %%xmm15          \n\t"  // x3
	"shufps $0, %%xmm12, %%xmm12\n\t"
	"shufps $0, %%xmm13, %%xmm13\n\t"
	"shufps $0, %%xmm14, %%xmm14\n\t"
	"shufps $0, %%xmm15, %%xmm15\n\t"

	"movss  16(%2), %%xmm0           \n\t"  // x4
	"movss  20(%2), %%xmm1           \n\t"  // x5
	"movss  24(%2), %%xmm2           \n\t"  // x6
	"movss  28(%2), %%xmm3           \n\t"  // x7
	"shufps $0, %%xmm0 , %%xmm0 \n\t"
	"shufps $0, %%xmm1 , %%xmm1 \n\t"
	"shufps $0, %%xmm2 , %%xmm2 \n\t"
	"shufps $0, %%xmm3 , %%xmm3 \n\t"

	"movss    (%9), %%xmm6           \n\t"  // alpha
	"shufps $0, %%xmm6 , %%xmm6 \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"xorps  %%xmm4 , %%xmm4          \n\t"
	"xorps  %%xmm5 , %%xmm5          \n\t"
	"movups (%3,%0,4), %%xmm7        \n\t"  // 4 * y

	".align 2                        \n\t"
	"movups (%4,%0,4), %%xmm8        \n\t"
	"movups (%5,%0,4), %%xmm9        \n\t"
	"movups (%6,%0,4), %%xmm10       \n\t"
	"movups (%7,%0,4), %%xmm11       \n\t"
	".align 2                        \n\t"
	"mulps  %%xmm12, %%xmm8          \n\t"
	"mulps  %%xmm13, %%xmm9          \n\t"
	"mulps  %%xmm14, %%xmm10         \n\t"
	"mulps  %%xmm15, %%xmm11         \n\t"
	"addps  %%xmm8 , %%xmm4          \n\t"
	"addps  %%xmm9 , %%xmm5          \n\t"
	"addps  %%xmm10, %%xmm4          \n\t"
	"addps  %%xmm11, %%xmm5          \n\t"

	"movups (%4,%8,4), %%xmm8        \n\t"
	"movups (%5,%8,4), %%xmm9        \n\t"
	"movups (%6,%8,4), %%xmm10       \n\t"
	"movups (%7,%8,4), %%xmm11       \n\t"
	".align 2                        \n\t"
	"mulps  %%xmm0 , %%xmm8          \n\t"
	"mulps  %%xmm1 , %%xmm9          \n\t"
	"mulps  %%xmm2 , %%xmm10         \n\t"
	"mulps  %%xmm3 , %%xmm11         \n\t"
	"addps  %%xmm8 , %%xmm4          \n\t"
	"addps  %%xmm9 , %%xmm5          \n\t"
	"addps  %%xmm10, %%xmm4          \n\t"
	"addps  %%xmm11, %%xmm5          \n\t"

	"addq   $4 , %8                  \n\t"
	"addps  %%xmm5 , %%xmm4          \n\t"
	"addq   $4 , %0                  \n\t"
	"mulps  %%xmm6 , %%xmm4          \n\t"
	"subq   $4 , %1                  \n\t"
	"addps  %%xmm4 , %%xmm7          \n\t"

	"movups %%xmm7 , -16(%3,%0,4)    \n\t"  // 4 * y

	"jnz    .L01LOOP%=               \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (lda4),   // 8
	"r" (alpha)   // 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;
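	// SSE variant of the 4x4 kernel: four columns, four rows of y per iteration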
	__asm__ __volatile__
	(
	"movss    (%2), %%xmm12          \n\t"  // x0
	"movss   4(%2), %%xmm13          \n\t"  // x1
	"movss   8(%2), %%xmm14          \n\t"  // x2
	"movss  12(%2), %%xmm15          \n\t"  // x3
	"shufps $0, %%xmm12, %%xmm12\n\t"
	"shufps $0, %%xmm13, %%xmm13\n\t"
	"shufps $0, %%xmm14, %%xmm14\n\t"
	"shufps $0, %%xmm15, %%xmm15\n\t"

	"movss    (%8), %%xmm6           \n\t"  // alpha
	"shufps $0, %%xmm6 , %%xmm6 \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"xorps  %%xmm4 , %%xmm4          \n\t"
	"movups (%3,%0,4), %%xmm7        \n\t"  // 4 * y

	"movups (%4,%0,4), %%xmm8        \n\t"
	"movups (%5,%0,4), %%xmm9        \n\t"
	"movups (%6,%0,4), %%xmm10       \n\t"
	"movups (%7,%0,4), %%xmm11       \n\t"
	"mulps  %%xmm12, %%xmm8          \n\t"
	"mulps  %%xmm13, %%xmm9          \n\t"
	"mulps  %%xmm14, %%xmm10         \n\t"
	"mulps  %%xmm15, %%xmm11         \n\t"
	"addps  %%xmm8 , %%xmm4          \n\t"
	"addq   $4 , %0                  \n\t"
	"addps  %%xmm9 , %%xmm4          \n\t"
	"subq   $4 , %1                  \n\t"
	"addps  %%xmm10 , %%xmm4         \n\t"
	"addps  %%xmm4 , %%xmm11         \n\t"

	"mulps  %%xmm6 , %%xmm11         \n\t"
	"addps  %%xmm7 , %%xmm11         \n\t"
	"movups %%xmm11, -16(%3,%0,4)    \n\t"  // 4 * y

	"jnz    .L01LOOP%=               \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (alpha)   // 8
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

@ -0,0 +1,370 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project. All rights reserved.
[BSD-3-Clause license text identical to the header above]
*****************************************************************************/

#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG register i = 0;
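	// AVX variant of the 4x8 kernel without FMA: vmulps into scratch registers,
	// vaddps into the ymm4/ymm5 accumulators; prefetcht0 stays 192 bytes ahead
	// of every column stream in the unrolled 16-row loop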
	__asm__ __volatile__
	(
	"vzeroupper                      \n\t"
	"vbroadcastss    (%2), %%ymm12   \n\t"  // x0
	"vbroadcastss   4(%2), %%ymm13   \n\t"  // x1
	"vbroadcastss   8(%2), %%ymm14   \n\t"  // x2
	"vbroadcastss  12(%2), %%ymm15   \n\t"  // x3
	"vbroadcastss  16(%2), %%ymm0    \n\t"  // x4
	"vbroadcastss  20(%2), %%ymm1    \n\t"  // x5
	"vbroadcastss  24(%2), %%ymm2    \n\t"  // x6
	"vbroadcastss  28(%2), %%ymm3    \n\t"  // x7

	"vbroadcastss    (%9), %%ymm6    \n\t"  // alpha

	"testq  $0x04, %1                \n\t"
	"jz     .L08LABEL%=              \n\t"

	"vxorps  %%xmm4 , %%xmm4 , %%xmm4 \n\t"
	"vxorps  %%xmm5 , %%xmm5 , %%xmm5 \n\t"
	"vmovups (%3,%0,4), %%xmm7       \n\t"  // 4 * y

	"vmulps (%4,%0,4), %%xmm12, %%xmm8  \n\t"
	"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
	"vmulps (%6,%0,4), %%xmm14, %%xmm9  \n\t"
	"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
	"vaddps %%xmm4, %%xmm8 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm10, %%xmm5  \n\t"
	"vaddps %%xmm4, %%xmm9 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm11, %%xmm5  \n\t"

	"vmulps (%4,%8,4), %%xmm0 , %%xmm8  \n\t"
	"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
	"vmulps (%6,%8,4), %%xmm2 , %%xmm9  \n\t"
	"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
	"vaddps %%xmm4, %%xmm8 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm10, %%xmm5  \n\t"
	"vaddps %%xmm4, %%xmm9 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm11, %%xmm5  \n\t"

	"vaddps %%xmm5, %%xmm4 , %%xmm4  \n\t"
	"vmulps %%xmm6, %%xmm4 , %%xmm5  \n\t"
	"vaddps %%xmm5, %%xmm7 , %%xmm5  \n\t"

	"vmovups %%xmm5, (%3,%0,4)       \n\t"  // 4 * y

	"addq   $4, %8                   \n\t"
	"addq   $4, %0                   \n\t"
	"subq   $4, %1                   \n\t"

	".L08LABEL%=:                    \n\t"

	"testq  $0x08, %1                \n\t"
	"jz     .L16LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%ymm7       \n\t"  // 8 * y

	"vmulps (%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
	"vmulps (%6,%0,4), %%ymm14, %%ymm9  \n\t"
	"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm10, %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm9 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vmulps (%4,%8,4), %%ymm0 , %%ymm8  \n\t"
	"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
	"vmulps (%6,%8,4), %%ymm2 , %%ymm9  \n\t"
	"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm10, %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm9 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vaddps %%ymm5, %%ymm4 , %%ymm4  \n\t"
	"vmulps %%ymm6, %%ymm4 , %%ymm5  \n\t"
	"vaddps %%ymm5, %%ymm7 , %%ymm5  \n\t"

	"vmovups %%ymm5, (%3,%0,4)       \n\t"  // 8 * y

	"addq   $8, %8                   \n\t"
	"addq   $8, %0                   \n\t"
	"subq   $8, %1                   \n\t"

	".L16LABEL%=:                    \n\t"

	"cmpq   $0, %1                   \n\t"
	"je     .L16END%=                \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"

	"prefetcht0 192(%4,%0,4)            \n\t"
	"vmulps   (%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps 32(%4,%0,4), %%ymm12, %%ymm9  \n\t"
	"prefetcht0 192(%5,%0,4)            \n\t"
	"vmulps   (%5,%0,4), %%ymm13, %%ymm10 \n\t"
	"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"prefetcht0 192(%6,%0,4)            \n\t"
	"vmulps   (%6,%0,4), %%ymm14, %%ymm8  \n\t"
	"vmulps 32(%6,%0,4), %%ymm14, %%ymm9  \n\t"
	"prefetcht0 192(%7,%0,4)            \n\t"
	"vmulps   (%7,%0,4), %%ymm15, %%ymm10 \n\t"
	"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"prefetcht0 192(%4,%8,4)            \n\t"
	"vmulps   (%4,%8,4), %%ymm0 , %%ymm8  \n\t"
	"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9  \n\t"
	"prefetcht0 192(%5,%8,4)            \n\t"
	"vmulps   (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
	"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"prefetcht0 192(%6,%8,4)            \n\t"
	"vmulps   (%6,%8,4), %%ymm2 , %%ymm8  \n\t"
	"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9  \n\t"
	"prefetcht0 192(%7,%8,4)            \n\t"
	"vmulps   (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
	"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vmulps %%ymm6, %%ymm4 , %%ymm4  \n\t"
	"vmulps %%ymm6, %%ymm5 , %%ymm5  \n\t"

	"vaddps   (%3,%0,4), %%ymm4 , %%ymm4 \n\t"  // 8 * y
	"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t"  // 8 * y

	"vmovups %%ymm4,   (%3,%0,4)     \n\t"  // 8 * y
	"vmovups %%ymm5, 32(%3,%0,4)     \n\t"  // 8 * y

	"addq   $16, %8                  \n\t"
	"addq   $16, %0                  \n\t"
	"subq   $16, %1                  \n\t"
	"jnz    .L01LOOP%=               \n\t"

	".L16END%=:                      \n\t"
	"vzeroupper                      \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (lda4),   // 8
	"r" (alpha)   // 9
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;
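	// AVX variant of the 4x4 kernel without FMA; same 4/8-row tail handling
	// as the 4x8 kernel above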
	__asm__ __volatile__
	(
	"vzeroupper                      \n\t"
	"vbroadcastss    (%2), %%ymm12   \n\t"  // x0
	"vbroadcastss   4(%2), %%ymm13   \n\t"  // x1
	"vbroadcastss   8(%2), %%ymm14   \n\t"  // x2
	"vbroadcastss  12(%2), %%ymm15   \n\t"  // x3

	"vbroadcastss    (%8), %%ymm6    \n\t"  // alpha

	"testq  $0x04, %1                \n\t"
	"jz     .L08LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%xmm7       \n\t"  // 4 * y

	"vmulps (%4,%0,4), %%xmm12, %%xmm8  \n\t"
	"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
	"vmulps (%6,%0,4), %%xmm14, %%xmm9  \n\t"
	"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
	"vaddps %%xmm4, %%xmm8 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm10, %%xmm5  \n\t"
	"vaddps %%xmm4, %%xmm9 , %%xmm4  \n\t"
	"vaddps %%xmm5, %%xmm11, %%xmm5  \n\t"

	"vaddps %%xmm5, %%xmm4 , %%xmm4  \n\t"
	"vmulps %%xmm6, %%xmm4 , %%xmm5  \n\t"
	"vaddps %%xmm5, %%xmm7 , %%xmm5  \n\t"

	"vmovups %%xmm5, (%3,%0,4)       \n\t"  // 4 * y

	"addq   $4, %0                   \n\t"
	"subq   $4, %1                   \n\t"

	".L08LABEL%=:                    \n\t"

	"testq  $0x08, %1                \n\t"
	"jz     .L16LABEL%=              \n\t"

	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"
	"vmovups (%3,%0,4), %%ymm7       \n\t"  // 8 * y

	"vmulps (%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
	"vmulps (%6,%0,4), %%ymm14, %%ymm9  \n\t"
	"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm10, %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm9 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vaddps %%ymm5, %%ymm4 , %%ymm4  \n\t"
	"vmulps %%ymm6, %%ymm4 , %%ymm5  \n\t"
	"vaddps %%ymm5, %%ymm7 , %%ymm5  \n\t"

	"vmovups %%ymm5, (%3,%0,4)       \n\t"  // 8 * y

	"addq   $8, %0                   \n\t"
	"subq   $8, %1                   \n\t"

	".L16LABEL%=:                    \n\t"

	"cmpq   $0, %1                   \n\t"
	"je     .L16END%=                \n\t"

	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vxorps  %%ymm4 , %%ymm4 , %%ymm4 \n\t"
	"vxorps  %%ymm5 , %%ymm5 , %%ymm5 \n\t"
	"vmovups   (%3,%0,4), %%ymm0     \n\t"  // 8 * y
	"vmovups 32(%3,%0,4), %%ymm1     \n\t"  // 8 * y

	"prefetcht0 192(%4,%0,4)            \n\t"
	"vmulps   (%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps 32(%4,%0,4), %%ymm12, %%ymm9  \n\t"
	"prefetcht0 192(%5,%0,4)            \n\t"
	"vmulps   (%5,%0,4), %%ymm13, %%ymm10 \n\t"
	"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"prefetcht0 192(%6,%0,4)            \n\t"
	"vmulps   (%6,%0,4), %%ymm14, %%ymm8  \n\t"
	"vmulps 32(%6,%0,4), %%ymm14, %%ymm9  \n\t"
	"prefetcht0 192(%7,%0,4)            \n\t"
	"vmulps   (%7,%0,4), %%ymm15, %%ymm10 \n\t"
	"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
	"vaddps %%ymm4, %%ymm8 , %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm9 , %%ymm5  \n\t"
	"vaddps %%ymm4, %%ymm10, %%ymm4  \n\t"
	"vaddps %%ymm5, %%ymm11, %%ymm5  \n\t"

	"vmulps %%ymm6, %%ymm4 , %%ymm4  \n\t"
	"vmulps %%ymm6, %%ymm5 , %%ymm5  \n\t"

	"vaddps %%ymm4, %%ymm0 , %%ymm0  \n\t"
	"vaddps %%ymm5, %%ymm1 , %%ymm1  \n\t"

	"vmovups %%ymm0,   (%3,%0,4)     \n\t"  // 8 * y
	"vmovups %%ymm1, 32(%3,%0,4)     \n\t"  // 8 * y

	"addq   $16, %0                  \n\t"
	"subq   $16, %1                  \n\t"
	"jnz    .L01LOOP%=               \n\t"

	".L16END%=:                      \n\t"
	"vzeroupper                      \n\t"

	:
	:
	"r" (i),      // 0
	"r" (n),      // 1
	"r" (x),      // 2
	"r" (y),      // 3
	"r" (ap[0]),  // 4
	"r" (ap[1]),  // 5
	"r" (ap[2]),  // 6
	"r" (ap[3]),  // 7
	"r" (alpha)   // 8
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

@ -1,473 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project. All rights reserved.
[BSD-3-Clause license text identical to the header above]
*****************************************************************************/

static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
{
	float *pre = a + lda*2;

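	// old kernel, non-FMA counterpart of the 64-row kernel removed above:
	// vmulps into ymm4..ymm7, vaddps into the ymm8..ymm15 accumulators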
	__asm__ __volatile__
	(
	"movq    %0, %%rax\n\t"  // n -> rax
	"vbroadcastss %1, %%ymm1\n\t"  // alpha -> ymm1
	"movq    %2, %%rsi\n\t"  // address of a -> rsi
	"movq    %3, %%rcx\n\t"  // value of lda -> rcx
	"movq    %4, %%rdi\n\t"  // address of x -> rdi
	"movq    %5, %%rdx\n\t"  // address of y -> rdx
	"movq    %6, %%r8\n\t"   // address for prefetch
	"prefetcht0 (%%r8)\n\t"   // Prefetch
	"prefetcht0 64(%%r8)\n\t" // Prefetch

	"vxorps  %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps  %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
	"vxorps  %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
	"vxorps  %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
	"vxorps  %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
	"vxorps  %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
	"vxorps  %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
	"vxorps  %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
	".align 16                       \n\t"
	".L01LOOP%=:                     \n\t"
	"vbroadcastss (%%rdi), %%ymm0    \n\t"  // load value of x
	"nop                             \n\t"
	"leaq    (%%r8 , %%rcx, 4), %%r8 \n\t"  // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t"  // Prefetch
	"vmulps   0*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vmulps   8*4(%%rsi), %%ymm0, %%ymm5 \n\t"  // multiply a and x
	"prefetcht0 64(%%r8)\n\t"  // Prefetch
	"vmulps  16*4(%%rsi), %%ymm0, %%ymm6 \n\t"  // multiply a and x
	"vmulps  24*4(%%rsi), %%ymm0, %%ymm7 \n\t"  // multiply a and x

	"vaddps  %%ymm8 , %%ymm4, %%ymm8 \n\t"  // add to temp
	"vaddps  %%ymm9 , %%ymm5, %%ymm9 \n\t"  // add to temp
	"prefetcht0 128(%%r8)\n\t"  // Prefetch
	"vaddps  %%ymm10, %%ymm6, %%ymm10\n\t"  // add to temp
	"vaddps  %%ymm11, %%ymm7, %%ymm11\n\t"  // add to temp

	"prefetcht0 192(%%r8)\n\t"  // Prefetch
	"vmulps  32*4(%%rsi), %%ymm0, %%ymm4 \n\t"  // multiply a and x
	"vmulps  40*4(%%rsi), %%ymm0, %%ymm5 \n\t"  // multiply a and x
	"vmulps  48*4(%%rsi), %%ymm0, %%ymm6 \n\t"  // multiply a and x
	"vmulps  56*4(%%rsi), %%ymm0, %%ymm7 \n\t"  // multiply a and x

	"vaddps  %%ymm12, %%ymm4, %%ymm12\n\t"  // add to temp
	"vaddps  %%ymm13, %%ymm5, %%ymm13\n\t"  // add to temp
	"vaddps  %%ymm14, %%ymm6, %%ymm14\n\t"  // add to temp
	"vaddps  %%ymm15, %%ymm7, %%ymm15\n\t"  // add to temp

	"addq    $4 , %%rdi              \n\t"  // increment pointer of x
	"leaq    (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec     %%rax                   \n\t"  // n = n - 1
	"jnz     .L01LOOP%=              \n\t"

	"vmulps  %%ymm8 , %%ymm1, %%ymm8 \n\t"  // scale by alpha
	"vmulps  %%ymm9 , %%ymm1, %%ymm9 \n\t"  // scale by alpha
	"vmulps  %%ymm10, %%ymm1, %%ymm10\n\t"  // scale by alpha
	"vmulps  %%ymm11, %%ymm1, %%ymm11\n\t"  // scale by alpha
	"vmulps  %%ymm12, %%ymm1, %%ymm12\n\t"  // scale by alpha
	"vmulps  %%ymm13, %%ymm1, %%ymm13\n\t"  // scale by alpha
	"vmulps  %%ymm14, %%ymm1, %%ymm14\n\t"  // scale by alpha
	"vmulps  %%ymm15, %%ymm1, %%ymm15\n\t"  // scale by alpha

	"vmovups %%ymm8 ,     (%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm9 ,  8*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm10, 16*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm11, 24*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm12, 32*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm13, 40*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm14, 48*4(%%rdx)    \n\t"  // store temp -> y
	"vmovups %%ymm15, 56*4(%%rdx)    \n\t"  // store temp -> y

	:
	:
	"m" (n),      // 0
	"m" (alpha),  // 1
	"m" (a),      // 2
	"m" (lda),    // 3
	"m" (x),      // 4
	"m" (y),      // 5
	"m" (pre)     // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}

static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
|
||||
{
|
||||
|
||||
|
||||
float *pre = a + lda*3;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"movq %0, %%rax\n\t" // n -> rax
|
||||
"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
|
||||
"movq %2, %%rsi\n\t" // adress of a -> rsi
|
||||
"movq %3, %%rcx\n\t" // value of lda > rcx
|
||||
"movq %4, %%rdi\n\t" // adress of x -> rdi
|
||||
"movq %5, %%rdx\n\t" // adress of y -> rdx
|
||||
"movq %6, %%r8\n\t" // address for prefetch
|
||||
"prefetcht0 (%%r8)\n\t" // Prefetch
|
||||
"prefetcht0 64(%%r8)\n\t" // Prefetch
|
||||
|
||||
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
|
||||
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
|
||||
"vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
|
||||
"vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
|
||||
"nop \n\t"
|
||||
"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
|
||||
|
||||
"prefetcht0 (%%r8)\n\t" // Prefetch
|
||||
"prefetcht0 64(%%r8)\n\t" // Prefetch
|
||||
|
||||
"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
|
||||
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
|
||||
"vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
|
||||
"vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
|
||||
|
||||
"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
|
||||
"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
|
||||
"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
|
||||
"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
|
||||
|
||||
|
||||
|
||||
"addq $4 , %%rdi \n\t" // increment pointer of c
|
||||
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
|
||||
|
||||
"dec %%rax \n\t" // n = n -1
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
|
||||
"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
|
||||
"vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
|
||||
"vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
|
||||
|
||||
"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
|
||||
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
|
||||
"vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
|
||||
"vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
|
||||
|
||||
:
|
||||
:
|
||||
"m" (n), // 0
|
||||
"m" (alpha), // 1
|
||||
"m" (a), // 2
|
||||
"m" (lda), // 3
|
||||
"m" (x), // 4
|
||||
"m" (y), // 5
|
||||
"m" (pre) // 6
|
||||
: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

	float *pre = a + lda*3;

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx
	"movq %6, %%r8\n\t" // address for prefetch
	"prefetcht0 (%%r8)\n\t" // prefetch
	"prefetcht0 64(%%r8)\n\t" // prefetch

	"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
	"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vbroadcastss (%%rdi), %%ymm0 \n\t" // load value of x
	"nop \n\t"
	"leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch

	"prefetcht0 (%%r8)\n\t" // prefetch

	"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and x
	"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and x

	"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // add to temp
	"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // add to temp

	"addq $4 , %%rdi \n\t" // increment pointer of x
	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
	"vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha

	"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
	"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y), // 5
	"m" (pre) // 6
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"memory"
	);

}

static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
{

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vbroadcastss (%%rdi), %%ymm0 \n\t" // load value of x

	"vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and x
	"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // add to temp

	"addq $4 , %%rdi \n\t" // increment pointer of x
	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
	"vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"memory"
	);

}

static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
{

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero

	".L01LOOP%=: \n\t"
	"vbroadcastss (%%rdi), %%xmm0 \n\t" // load value of x

	"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and x
	"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // add to temp

	"addq $4 , %%rdi \n\t" // increment pointer of x
	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha

	"vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1", "%xmm4",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}

static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
{

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
	"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero

	".L01LOOP%=: \n\t"
	"vmovss (%%rdi), %%xmm0 \n\t" // load value of x

	"vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and x
	"vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and x

	"vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // add to temp
	"vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // add to temp

	"addq $4 , %%rdi \n\t" // increment pointer of x
	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
	"vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha

	"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
	"vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1", "%xmm4", "%xmm5",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}

static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
{

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero

	".L01LOOP%=: \n\t"
	"vmovss (%%rdi), %%xmm0 \n\t" // load value of x
	"addq $4 , %%rdi \n\t" // increment pointer of x

	"vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and x
	"vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // add to temp

	"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a

	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha

	"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	"%xmm0", "%xmm1", "%xmm4",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}
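
These fixed-width kernels all share one contract: walk n columns of a (column stride lda), multiply each column block by the matching element of x, accumulate in registers, then scale by alpha and store the block of y. A plain-C sketch of that contract for sgemv_kernel_8, as I read the asm above (illustration only, not code from this commit):

/* Hypothetical reference version of sgemv_kernel_8: 8 rows of y,
   accumulated across n columns of a (column stride lda), then
   scaled by alpha and stored. Names mirror the asm operands. */
static void sgemv_kernel_8_ref(long n, float alpha, float *a, long lda,
                               float *x, float *y)
{
	float temp[8] = { 0.0f };
	long i, j;
	for (j = 0; j < n; j++) {                  /* one column per iteration */
		for (i = 0; i < 8; i++)
			temp[i] += a[j * lda + i] * x[j];  /* vmulps/vaddps pair */
	}
	for (i = 0; i < 8; i++)
		y[i] = temp[i] * alpha;                /* scale by alpha, store temp -> y */
}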

@@ -0,0 +1,624 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if defined(NEHALEM)
#include "sgemv_t_microk_nehalem-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell-4.c"
#endif

#define NBMAX 4096

#ifndef HAVE_KERNEL_4x4

static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT *a0,*a1,*a2,*a3;
	a0 = ap[0];
	a1 = ap[1];
	a2 = ap[2];
	a3 = ap[3];
	FLOAT temp0 = 0.0;
	FLOAT temp1 = 0.0;
	FLOAT temp2 = 0.0;
	FLOAT temp3 = 0.0;

	for ( i=0; i< n; i+=4 )
	{
		temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
		temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
		temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
		temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
	}
	y[0] = temp0;
	y[1] = temp1;
	y[2] = temp2;
	y[3] = temp3;
}

#endif
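
The fallback above doubles as documentation of the 4x4 kernel contract: each ap[k] points at one column of A, and y receives the four dot products against x. A minimal, hypothetical harness (not part of the commit; the simple loop is equivalent to the unrolled one above):

#include <stdio.h>

typedef long BLASLONG;
typedef float FLOAT;

static void sgemv_kernel_4x4_ref(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT t0 = 0.0f, t1 = 0.0f, t2 = 0.0f, t3 = 0.0f;
	for (i = 0; i < n; i++) {
		t0 += ap[0][i] * x[i];
		t1 += ap[1][i] * x[i];
		t2 += ap[2][i] * x[i];
		t3 += ap[3][i] * x[i];
	}
	y[0] = t0; y[1] = t1; y[2] = t2; y[3] = t3;
}

int main(void)
{
	FLOAT a0[8], a1[8], a2[8], a3[8], x[8], y[4];
	FLOAT *ap[4] = { a0, a1, a2, a3 };
	for (int i = 0; i < 8; i++) {
		a0[i] = 1.0f; a1[i] = 2.0f; a2[i] = 3.0f; a3[i] = 4.0f;
		x[i] = 1.0f;                   /* x = ones, so y[k] = column sums */
	}
	sgemv_kernel_4x4_ref(8, ap, x, y);
	printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); /* expect 8 16 24 32 */
	return 0;
}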

static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
{
	BLASLONG i;

	i=0;

	__asm__ __volatile__
	(
	"xorps %%xmm10 , %%xmm10 \n\t"
	"xorps %%xmm11 , %%xmm11 \n\t"

	"testq $4 , %1 \n\t"
	"jz .L01LABEL%= \n\t"

	"movups (%5,%0,4) , %%xmm14 \n\t" // x
	"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
	"movups (%4,%0,4) , %%xmm13 \n\t" // ap1
	"mulps %%xmm14 , %%xmm12 \n\t"
	"mulps %%xmm14 , %%xmm13 \n\t"
	"addq $4 , %0 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"subq $4 , %1 \n\t"
	"addps %%xmm13 , %%xmm11 \n\t"

	".L01LABEL%=: \n\t"

	"cmpq $0, %1 \n\t"
	"je .L01END%= \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%5,%0,4) , %%xmm14 \n\t" // x
	"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
	"movups (%4,%0,4) , %%xmm13 \n\t" // ap1
	"mulps %%xmm14 , %%xmm12 \n\t"
	"mulps %%xmm14 , %%xmm13 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"addps %%xmm13 , %%xmm11 \n\t"

	"movups 16(%5,%0,4) , %%xmm14 \n\t" // x
	"movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0
	"movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1
	"mulps %%xmm14 , %%xmm12 \n\t"
	"mulps %%xmm14 , %%xmm13 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"addps %%xmm13 , %%xmm11 \n\t"

	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"
	"jnz .L01LOOP%= \n\t"

	".L01END%=: \n\t"

	"haddps %%xmm10, %%xmm10 \n\t"
	"haddps %%xmm11, %%xmm11 \n\t"
	"haddps %%xmm10, %%xmm10 \n\t"
	"haddps %%xmm11, %%xmm11 \n\t"

	"movss %%xmm10, (%2) \n\t"
	"movss %%xmm11,4(%2) \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (y), // 2
	"r" (ap0), // 3
	"r" (ap1), // 4
	"r" (x) // 5
	: "cc",
	"%xmm4", "%xmm5", "%xmm10", "%xmm11",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}
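
The epilogue's haddps pair is the standard SSE3 horizontal reduction: each haddps halves the number of distinct partial sums, so two of them collapse the four lanes into lane 0, which movss then writes out. The same idiom written with intrinsics (illustration only, assumes SSE3 is available):

#include <pmmintrin.h>  /* SSE3: _mm_hadd_ps */

/* Sketch of the reduction used above: v = [a, b, c, d] -> a+b+c+d. */
static inline float hsum_ps(__m128 v)
{
	v = _mm_hadd_ps(v, v);   /* [a+b, c+d, a+b, c+d] */
	v = _mm_hadd_ps(v, v);   /* [a+b+c+d, ...]       */
	return _mm_cvtss_f32(v);
}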

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;

	i=0;

	__asm__ __volatile__
	(
	"xorps %%xmm9 , %%xmm9 \n\t"
	"xorps %%xmm10 , %%xmm10 \n\t"

	"testq $4 , %1 \n\t"
	"jz .L01LABEL%= \n\t"

	"movups (%3,%0,4) , %%xmm12 \n\t"
	"movups (%4,%0,4) , %%xmm11 \n\t"
	"mulps %%xmm11 , %%xmm12 \n\t"
	"addq $4 , %0 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"subq $4 , %1 \n\t"

	".L01LABEL%=: \n\t"

	"cmpq $0, %1 \n\t"
	"je .L01END%= \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%3,%0,4) , %%xmm12 \n\t"
	"movups 16(%3,%0,4) , %%xmm14 \n\t"
	"movups (%4,%0,4) , %%xmm11 \n\t"
	"movups 16(%4,%0,4) , %%xmm13 \n\t"
	"mulps %%xmm11 , %%xmm12 \n\t"
	"mulps %%xmm13 , %%xmm14 \n\t"
	"addq $8 , %0 \n\t"
	"addps %%xmm12 , %%xmm10 \n\t"
	"subq $8 , %1 \n\t"
	"addps %%xmm14 , %%xmm9 \n\t"

	"jnz .L01LOOP%= \n\t"

	".L01END%=: \n\t"

	"addps %%xmm9 , %%xmm10 \n\t"
	"haddps %%xmm10, %%xmm10 \n\t"
	"haddps %%xmm10, %%xmm10 \n\t"

	"movss %%xmm10, (%2) \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (y), // 2
	"r" (ap), // 3
	"r" (x) // 4
	: "cc",
	"%xmm9", "%xmm10" ,
	"%xmm11", "%xmm12", "%xmm13", "%xmm14",
	"memory"
	);

}

static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
	BLASLONG i;
	for ( i=0; i<n; i++ )
	{
		*dest = *src;
		dest++;
		src += inc_src;
	}
}

static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));

static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{

	BLASLONG i;

	if ( inc_dest != 1 )
	{
		for ( i=0; i<n; i++ )
		{
			*dest += src[i] * da;
			dest += inc_dest;
		}
		return;
	}

	i=0;

	__asm__ __volatile__
	(
	"movss (%2) , %%xmm10 \n\t"
	"shufps $0 , %%xmm10 , %%xmm10 \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%3,%0,4) , %%xmm12 \n\t"
	"movups (%4,%0,4) , %%xmm11 \n\t"
	"mulps %%xmm10 , %%xmm12 \n\t"
	"addq $4 , %0 \n\t"
	"addps %%xmm12 , %%xmm11 \n\t"
	"subq $4 , %1 \n\t"
	"movups %%xmm11, -16(%4,%0,4) \n\t"

	"jnz .L01LOOP%= \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (&da), // 2
	"r" (src), // 3
	"r" (dest) // 4
	: "cc",
	"%xmm10", "%xmm11", "%xmm12",
	"memory"
	);

}
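
For reference, the SSE loop above is the vectorized form of dest[i] += src[i] * da: shufps broadcasts da across the register, and each iteration loads, multiplies, adds, and stores four floats. A scalar sketch of my reading (assumes inc_dest == 1 and n a multiple of 4, which is how the driver below calls it):

/* Hypothetical scalar equivalent of the asm path in add_y above. */
static void add_y_ref(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
	BLASLONG i;
	for (i = 0; i < n; i++)
		dest[i] += src[i] * da;   /* mulps by broadcast da, addps, store */
}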

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG register i;
	BLASLONG register j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	BLASLONG n0;
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	FLOAT ybuffer[4],*xbuffer;
	FLOAT *ytemp;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	xbuffer = buffer;
	ytemp = buffer + NBMAX;

	n0 = n / NBMAX;
	n1 = (n % NBMAX) >> 2 ;
	n2 = n & 3 ;

	m3 = m & 3 ;
	m1 = m & -4 ;
	m2 = (m & (NBMAX-1)) - m3 ;

	BLASLONG NB = NBMAX;

	while ( NB == NBMAX )
	{

		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;
			NB = m2;
		}

		y_ptr = y;
		a_ptr = a;
		x_ptr = x;

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(NB,x_ptr,xbuffer,inc_x);

		FLOAT *ap[4];
		FLOAT *yp;
		BLASLONG register lda4 = 4 * lda;
		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		if ( n0 > 0 )
		{
			BLASLONG nb1 = NBMAX / 4;
			for( j=0; j<n0; j++)
			{

				yp = ytemp;
				for( i = 0; i < nb1 ; i++)
				{
					sgemv_kernel_4x4(NB,ap,xbuffer,yp);
					ap[0] += lda4 ;
					ap[1] += lda4 ;
					ap[2] += lda4 ;
					ap[3] += lda4 ;
					yp += 4;
				}
				add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
				y_ptr += nb1 * inc_y * 4;
				a_ptr += nb1 * lda4 ;

			}

		}

		yp = ytemp;

		for( i = 0; i < n1 ; i++)
		{
			sgemv_kernel_4x4(NB,ap,xbuffer,yp);
			ap[0] += lda4 ;
			ap[1] += lda4 ;
			ap[2] += lda4 ;
			ap[3] += lda4 ;
			yp += 4;
		}
		if ( n1 > 0 )
		{
			add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
			y_ptr += n1 * inc_y * 4;
			a_ptr += n1 * lda4 ;
		}

		if ( n2 & 2 )
		{

			sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
			a_ptr += lda * 2;
			*y_ptr += ybuffer[0] * alpha;
			y_ptr += inc_y;
			*y_ptr += ybuffer[1] * alpha;
			y_ptr += inc_y;

		}

		if ( n2 & 1 )
		{

			sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
			a_ptr += lda;
			*y_ptr += ybuffer[0] * alpha;
			y_ptr += inc_y;

		}
		a += NB;
		x += NB * inc_x;
	}

	if ( m3 == 0 ) return(0);

	x_ptr = x;
	a_ptr = a;
	if ( m3 == 3 )
	{
		FLOAT xtemp0 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp1 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp2 = *x_ptr * alpha;

		FLOAT *aj = a_ptr;
		y_ptr = y;

		if ( lda == 3 && inc_y == 1 )
		{

			for ( j=0; j< ( n & -4) ; j+=4 )
			{

				y_ptr[j]   += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
				y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
				y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
				y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
				aj += 12;
			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
				aj += 3;
			}

		}
		else
		{

			if ( inc_y == 1 )
			{

				BLASLONG register lda2 = lda << 1;
				BLASLONG register lda4 = lda << 2;
				BLASLONG register lda3 = lda2 + lda;

				for ( j=0; j< ( n & -4 ); j+=4 )
				{

					y_ptr[j]   += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
					y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
					y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
					y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
					aj += lda4;
				}

				for ( ; j< n ; j++ )
				{

					y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
					aj += lda;
				}

			}
			else
			{

				for ( j=0; j<n; j++ )
				{
					*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
					y_ptr += inc_y;
					aj += lda;
				}

			}

		}
		return(0);
	}

	if ( m3 == 2 )
	{
		FLOAT xtemp0 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp1 = *x_ptr * alpha;

		FLOAT *aj = a_ptr;
		y_ptr = y;

		if ( lda == 2 && inc_y == 1 )
		{

			for ( j=0; j< ( n & -4) ; j+=4 )
			{
				y_ptr[j]   += aj[0] * xtemp0 + aj[1] * xtemp1 ;
				y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
				y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
				y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
				aj += 8;

			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
				aj += 2;
			}

		}
		else
		{
			if ( inc_y == 1 )
			{

				BLASLONG register lda2 = lda << 1;
				BLASLONG register lda4 = lda << 2;
				BLASLONG register lda3 = lda2 + lda;

				for ( j=0; j< ( n & -4 ); j+=4 )
				{

					y_ptr[j]   += *aj * xtemp0 + *(aj+1) * xtemp1 ;
					y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
					y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
					y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
					aj += lda4;
				}

				for ( ; j< n ; j++ )
				{

					y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
					aj += lda;
				}

			}
			else
			{
				for ( j=0; j<n; j++ )
				{
					*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
					y_ptr += inc_y;
					aj += lda;
				}
			}

		}
		return(0);

	}

	FLOAT xtemp = *x_ptr * alpha;
	FLOAT *aj = a_ptr;
	y_ptr = y;
	if ( lda == 1 && inc_y == 1 )
	{
		for ( j=0; j< ( n & -4) ; j+=4 )
		{
			y_ptr[j]   += aj[j] * xtemp;
			y_ptr[j+1] += aj[j+1] * xtemp;
			y_ptr[j+2] += aj[j+2] * xtemp;
			y_ptr[j+3] += aj[j+3] * xtemp;
		}
		for ( ; j<n ; j++ )
		{
			y_ptr[j] += aj[j] * xtemp;
		}

	}
	else
	{
		if ( inc_y == 1 )
		{

			BLASLONG register lda2 = lda << 1;
			BLASLONG register lda4 = lda << 2;
			BLASLONG register lda3 = lda2 + lda;
			for ( j=0; j< ( n & -4 ); j+=4 )
			{
				y_ptr[j]   += *aj * xtemp;
				y_ptr[j+1] += *(aj+lda) * xtemp;
				y_ptr[j+2] += *(aj+lda2) * xtemp;
				y_ptr[j+3] += *(aj+lda3) * xtemp;
				aj += lda4 ;
			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += *aj * xtemp;
				aj += lda;
			}

		}
		else
		{
			for ( j=0; j<n; j++ )
			{
				*y_ptr += *aj * xtemp;
				y_ptr += inc_y;
				aj += lda;
			}

		}
	}

	return(0);
}
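
The blocking arithmetic at the top of CNAME is easiest to see with numbers plugged in: m is processed in panels of at most NBMAX rows, and n is split into full-NBMAX column blocks (n0), remaining groups of four columns (n1), and a 0..3 column tail (n2). A hypothetical example (sizes invented; the expressions are copied from above):

#include <stdio.h>
#define NBMAX 4096

int main(void)
{
	long m = 10000, n = 103;
	long n0 = n / NBMAX;              /* 0:     full NBMAX column blocks     */
	long n1 = (n % NBMAX) >> 2;       /* 25:    remaining 4-column groups    */
	long n2 = n & 3;                  /* 3:     leftover columns (4x2 + 4x1) */
	long m3 = m & 3;                  /* 0:     leftover rows, scalar tail   */
	long m1 = m & -4;                 /* 10000: rows rounded down to 4       */
	long m2 = (m & (NBMAX - 1)) - m3; /* 1808:  last partial row panel       */
	printf("n0=%ld n1=%ld n2=%ld m1=%ld m2=%ld m3=%ld\n",
	       n0, n1, n2, m1, m2, m3);
	return 0;
}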

@@ -1,232 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER)
#include "sgemv_t_microk_bulldozer.c"
#elif defined(HASWELL)
#include "sgemv_t_microk_haswell.c"
#else
#include "sgemv_t_microk_sandy.c"
#endif

static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
	BLASLONG i;
	for ( i=0; i<n; i++ )
	{
		*dest = *src;
		dest++;
		src += inc_src;
	}
}

static void sgemv_kernel_1( BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, FLOAT *y)
{

	FLOAT register temp0 = 0.0;
	BLASLONG i;
	for ( i=0; i<n ; i++)
	{
		temp0 += a[i] * x[i];
	}
	temp0 *= alpha ;
	*y += temp0;
}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	FLOAT *a_ptrl;
	BLASLONG m1;
	BLASLONG register m2;
	FLOAT *xbuffer;
	xbuffer = buffer;
	BLASLONG register Mblock;

	m1 = m / 1024 ;
	m2 = m % 1024 ;

	x_ptr = x;
	a_ptr = a;

	for (j=0; j<m1; j++)
	{

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(1024,x_ptr,xbuffer,inc_x);

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_16(1024,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}
		a_ptr += 1024;
		x_ptr += 1024 * inc_x;
	}

	if ( m2 == 0 ) return(0);

	Mblock = 512;
	while ( Mblock >= 16 )
	{
		if ( m2 & Mblock)
		{

			if ( inc_x == 1 )
				xbuffer = x_ptr;
			else
				copy_x(Mblock,x_ptr,xbuffer,inc_x);

			y_ptr = y;
			a_ptrl = a_ptr;

			for(i = 0; i<n; i++ )
			{
				sgemv_kernel_16(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
				y_ptr += inc_y;
				a_ptrl += lda;
			}
			a_ptr += Mblock;
			x_ptr += Mblock * inc_x;

		}
		Mblock /= 2;

	}

	if ( m2 & Mblock)
	{

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(Mblock,x_ptr,xbuffer,inc_x);

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}
		a_ptr += Mblock;
		x_ptr += Mblock * inc_x;

	}
	Mblock /= 2;

	if ( m2 & Mblock)
	{

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(Mblock,x_ptr,xbuffer,inc_x);

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}
		a_ptr += Mblock;
		x_ptr += Mblock * inc_x;

	}
	Mblock /= 2;

	if ( m2 & Mblock)
	{

		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(Mblock,x_ptr,xbuffer,inc_x);

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}
		a_ptr += Mblock;
		x_ptr += Mblock * inc_x;

	}
	Mblock /= 2;

	if ( m2 & Mblock)
	{

		xbuffer = x_ptr;

		y_ptr = y;
		a_ptrl = a_ptr;

		for(i = 0; i<n; i++ )
		{
			sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
			y_ptr += inc_y;
			a_ptrl += lda;
		}

	}

	return(0);
}
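
The deleted driver handled the sub-1024 remainder m2 by walking its bits from 512 down to 1, one block per set bit, so e.g. m2 = 1000 decomposes as 512+256+128+64+32+8. A compact sketch of that decomposition (illustration only):

#include <stdio.h>

int main(void)
{
	long m2 = 1000;  /* hypothetical remainder */
	for (long Mblock = 512; Mblock >= 1; Mblock /= 2)
		if (m2 & Mblock)
			printf("block of %ld rows\n", Mblock);
	return 0;        /* prints 512, 256, 128, 64, 32, 8 */
}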

@@ -0,0 +1,147 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"testq $0x04, %1 \n\t"
	"jz .L08LABEL%= \n\t"

	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
	"addq $4 , %0 \n\t"
	"subq $4 , %1 \n\t"

	".L08LABEL%=: \n\t"

	"testq $0x08, %1 \n\t"
	"jz .L16LABEL%= \n\t"

	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
	"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
	"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"

	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"

	".L16LABEL%=: \n\t"

	"cmpq $0, %1 \n\t"
	"je .L16END%= \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

	"prefetcht0 384(%4,%0,4) \n\t"
	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
	"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
	"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
	"prefetcht0 384(%5,%0,4) \n\t"
	".align 2 \n\t"
	"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
	"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"
	"prefetcht0 384(%6,%0,4) \n\t"
	".align 2 \n\t"
	"vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t"
	"vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
	"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t"
	"prefetcht0 384(%7,%0,4) \n\t"
	"vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t"
	"addq $16, %0 \n\t"
	"vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t"
	"vfmaddps %%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t"
	"subq $16, %1 \n\t"
	"vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"

	"jnz .L01LOOP%= \n\t"

	".L16END%=: \n\t"
	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"vmovss %%xmm4, (%3) \n\t"
	"vmovss %%xmm5, 4(%3) \n\t"
	"vmovss %%xmm6, 8(%3) \n\t"
	"vmovss %%xmm7, 12(%3) \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (x), // 2
	"r" (y), // 3
	"r" (ap[0]), // 4
	"r" (ap[1]), // 5
	"r" (ap[2]), // 6
	"r" (ap[3]) // 7
	: "cc",
	"%xmm4", "%xmm5",
	"%xmm6", "%xmm7",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}
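
vfmaddps here is AMD's 4-operand FMA4 encoding, which is why this variant is selected only for BULLDOZER/PILEDRIVER: each instruction computes acc = a * x + acc in one fused step instead of a separate multiply and add. The same per-step operation written with the FMA4 intrinsic, as an illustration (assumes a compiler with -mfma4 support):

#include <x86intrin.h>  /* FMA4 intrinsics; compile with -mfma4 */

/* One 4-lane fused multiply-add, matching each vfmaddps above. */
static inline __m128 fma4_step(__m128 acc, __m128 a, __m128 x)
{
	return _mm_macc_ps(a, x, acc);  /* a * x + acc */
}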

@@ -1,99 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

	//n = n / 16;

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
	"leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
	"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
	"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
	"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero

	"sarq $4, %%rax \n\t" // n = n / 16

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	// "prefetcht0 512(%%rsi) \n\t"
	"prefetcht0 (%%r8) \n\t" // prefetch next line of a
	"vmovups (%%rsi), %%xmm4 \n\t"
	"vmovups 4*4(%%rsi), %%xmm5 \n\t"
	"vmovups 8*4(%%rsi), %%xmm6 \n\t"
	"vmovups 12*4(%%rsi), %%xmm7 \n\t"

	"vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and x and add to temp
	"vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and x and add to temp
	"vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and x and add to temp
	"vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and x and add to temp

	"addq $16*4 , %%r8 \n\t" // increment prefetch pointer
	"addq $16*4 , %%rsi \n\t" // increment pointer of a
	"addq $16*4 , %%rdi \n\t" // increment pointer of x
	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
	"vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
	"vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
	"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
	"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"

	"vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t"
	"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
	"%xmm0", "%xmm1",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}

@@ -0,0 +1,148 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vzeroupper \n\t"
	"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
	"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
	"vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t"
	"vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"

	"testq $0x04, %1 \n\t"
	"jz .L08LABEL%= \n\t"

	"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

	"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t"

	"addq $4 , %0 \n\t"
	"subq $4 , %1 \n\t"

	".L08LABEL%=: \n\t"

	"testq $0x08, %1 \n\t"
	"jz .L16LABEL%= \n\t"

	"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x

	"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t"
	"vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t"
	"vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t"

	"addq $8 , %0 \n\t"
	"subq $8 , %1 \n\t"

	".L16LABEL%=: \n\t"

	"cmpq $0, %1 \n\t"
	"je .L16END%= \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	"prefetcht0 384(%2,%0,4) \n\t"
	"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
	"vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x

	"prefetcht0 384(%4,%0,4) \n\t"
	"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
	"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t"
	"prefetcht0 384(%5,%0,4) \n\t"
	"vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t"
	"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
	"prefetcht0 384(%6,%0,4) \n\t"
	"vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t"
	"vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t"
	"prefetcht0 384(%7,%0,4) \n\t"
	"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t"
	"vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t"

	"addq $16, %0 \n\t"
	"subq $16, %1 \n\t"
	"jnz .L01LOOP%= \n\t"

	".L16END%=: \n\t"

	"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
	"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
	"vextractf128 $1 , %%ymm6, %%xmm14 \n\t"
	"vextractf128 $1 , %%ymm7, %%xmm15 \n\t"

	"vaddps %%xmm4, %%xmm12, %%xmm4 \n\t"
	"vaddps %%xmm5, %%xmm13, %%xmm5 \n\t"
	"vaddps %%xmm6, %%xmm14, %%xmm6 \n\t"
	"vaddps %%xmm7, %%xmm15, %%xmm7 \n\t"

	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
	"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
	"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
	"vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t"

	"vmovss %%xmm4, (%3) \n\t"
	"vmovss %%xmm5, 4(%3) \n\t"
	"vmovss %%xmm6, 8(%3) \n\t"
	"vmovss %%xmm7, 12(%3) \n\t"

	"vzeroupper \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (x), // 2
	"r" (y), // 3
	"r" (ap[0]), // 4
	"r" (ap[1]), // 5
	"r" (ap[2]), // 6
	"r" (ap[3]) // 7
	: "cc",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}
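
The Haswell variant is the same dataflow with FMA3 (vfmadd231ps) and 8-wide ymm registers, bracketed by vzeroupper to avoid AVX/SSE transition stalls. Its per-element operation, written with the corresponding intrinsic (illustration only, assumes -mfma):

#include <immintrin.h>  /* AVX2/FMA3; compile with -mfma */

/* One 8-lane fused multiply-add, matching each vfmadd231ps above. */
static inline __m256 fma3_step(__m256 acc, __m256 a, __m256 x)
{
	return _mm256_fmadd_ps(a, x, acc);  /* a * x + acc */
}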

@@ -1,100 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

	//n = n / 16;

	__asm__ __volatile__
	(
	"movq %0, %%rax\n\t" // n -> rax
	"vmovss %1, %%xmm1\n\t" // alpha -> xmm1
	"movq %2, %%rsi\n\t" // address of a -> rsi
	"movq %3, %%rcx\n\t" // value of lda -> rcx
	"movq %4, %%rdi\n\t" // address of x -> rdi
	"movq %5, %%rdx\n\t" // address of y -> rdx

	"leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
	"leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line

	"vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
	"vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
	"vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
	"vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero

	"sarq $4, %%rax \n\t" // n = n / 16

	".align 16 \n\t"
	".L01LOOP%=: \n\t"
	// "prefetcht0 512(%%rsi) \n\t"
	"prefetcht0 (%%r8) \n\t" // prefetch next line of a
	"vmovups (%%rsi), %%xmm4 \n\t"
	"vmovups 4*4(%%rsi), %%xmm5 \n\t"
	"vmovups 8*4(%%rsi), %%xmm6 \n\t"
	"vmovups 12*4(%%rsi), %%xmm7 \n\t"

	"vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and x and add to temp
	"vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and x and add to temp
	"vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and x and add to temp
	"vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and x and add to temp

	"addq $16*4 , %%r8 \n\t" // increment prefetch pointer
	"addq $16*4 , %%rsi \n\t" // increment pointer of a
	"addq $16*4 , %%rdi \n\t" // increment pointer of x
	"dec %%rax \n\t" // n = n - 1
	"jnz .L01LOOP%= \n\t"

	"vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
	"vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
	"vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
	"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
	"vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"

	"vmulss %%xmm12, %%xmm1, %%xmm12\n\t"
	"vaddss (%%rdx), %%xmm12,%%xmm12\n\t"
	"vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y

	:
	:
	"m" (n), // 0
	"m" (alpha), // 1
	"m" (a), // 2
	"m" (lda), // 3
	"m" (x), // 4
	"m" (y) // 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
	"%xmm0", "%xmm1",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
	"memory"
	);

}

@@ -0,0 +1,99 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"xorps %%xmm4 , %%xmm4 \n\t"
	"xorps %%xmm5 , %%xmm5 \n\t"
	"xorps %%xmm6 , %%xmm6 \n\t"
	"xorps %%xmm7 , %%xmm7 \n\t"

	".align 16 \n\t"
	".L01LOOP%=: \n\t"

	"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
	"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0
	"movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1
	"movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2
	"movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3

	"mulps %%xmm12, %%xmm8 \n\t"
	"mulps %%xmm12, %%xmm9 \n\t"
	"mulps %%xmm12, %%xmm10 \n\t"
	"mulps %%xmm12, %%xmm11 \n\t"
	"addps %%xmm8 , %%xmm4 \n\t"
	"addq $4 , %0 \n\t"
	"addps %%xmm9 , %%xmm5 \n\t"
	"subq $4 , %1 \n\t"
	"addps %%xmm10, %%xmm6 \n\t"
	"addps %%xmm11, %%xmm7 \n\t"

	"jnz .L01LOOP%= \n\t"

	"haddps %%xmm4, %%xmm4 \n\t"
	"haddps %%xmm5, %%xmm5 \n\t"
	"haddps %%xmm6, %%xmm6 \n\t"
	"haddps %%xmm7, %%xmm7 \n\t"

	"haddps %%xmm4, %%xmm4 \n\t"
	"haddps %%xmm5, %%xmm5 \n\t"
	"haddps %%xmm6, %%xmm6 \n\t"
	"haddps %%xmm7, %%xmm7 \n\t"

	"movss %%xmm4, (%3) \n\t"
	"movss %%xmm5, 4(%3) \n\t"
	"movss %%xmm6, 8(%3) \n\t"
	"movss %%xmm7, 12(%3) \n\t"

	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (x), // 2
	"r" (y), // 3
	"r" (ap[0]), // 4
	"r" (ap[1]), // 5
	"r" (ap[2]), // 6
	"r" (ap[3]) // 7
	: "cc",
	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
	"%xmm12",
	"memory"
	);

}

@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	register BLASLONG i = 0;

	__asm__ __volatile__
	(
	"vzeroupper				 \n\t"
	"vxorps	%%ymm0 , %%ymm0, %%ymm0	 \n\t"	// clear the eight accumulators
	"vxorps	%%ymm1 , %%ymm1, %%ymm1	 \n\t"
	"vxorps	%%ymm2 , %%ymm2, %%ymm2	 \n\t"
	"vxorps	%%ymm3 , %%ymm3, %%ymm3	 \n\t"
	"vxorps	%%ymm4 , %%ymm4, %%ymm4	 \n\t"
	"vxorps	%%ymm5 , %%ymm5, %%ymm5	 \n\t"
	"vxorps	%%ymm6 , %%ymm6, %%ymm6	 \n\t"
	"vxorps	%%ymm7 , %%ymm7, %%ymm7	 \n\t"

	"testq	$0x04, %1		 \n\t"	// n & 4 ?
	"jz	.L08LABEL%=		 \n\t"

	"vmovups	(%2,%0,4), %%xmm12	 \n\t"	// 4 * x

	"vmulps	(%4,%0,4), %%xmm12, %%xmm8  \n\t"
	"vmulps	(%5,%0,4), %%xmm12, %%xmm10 \n\t"
	"vmulps	(%6,%0,4), %%xmm12, %%xmm9  \n\t"
	"vmulps	(%7,%0,4), %%xmm12, %%xmm11 \n\t"
	"vaddps	%%xmm4, %%xmm8 , %%xmm4	 \n\t"
	"addq	$4 , %0			 \n\t"
	"vaddps	%%xmm5, %%xmm10, %%xmm5	 \n\t"
	"vaddps	%%xmm6, %%xmm9 , %%xmm6	 \n\t"
	"subq	$4 , %1			 \n\t"
	"vaddps	%%xmm7, %%xmm11, %%xmm7	 \n\t"

	".L08LABEL%=:			 \n\t"

	"testq	$0x08, %1		 \n\t"	// n & 8 ?
	"jz	.L16LABEL%=		 \n\t"

	"vmovups	(%2,%0,4), %%ymm12	 \n\t"	// 8 * x

	"vmulps	(%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps	(%5,%0,4), %%ymm12, %%ymm10 \n\t"
	"vmulps	(%6,%0,4), %%ymm12, %%ymm9  \n\t"
	"vmulps	(%7,%0,4), %%ymm12, %%ymm11 \n\t"
	"vaddps	%%ymm4, %%ymm8 , %%ymm4	 \n\t"
	"addq	$8 , %0			 \n\t"
	"vaddps	%%ymm5, %%ymm10, %%ymm5	 \n\t"
	"vaddps	%%ymm6, %%ymm9 , %%ymm6	 \n\t"
	"subq	$8 , %1			 \n\t"
	"vaddps	%%ymm7, %%ymm11, %%ymm7	 \n\t"

	".L16LABEL%=:			 \n\t"

	"cmpq	$0, %1			 \n\t"
	"je	.L16END%=		 \n\t"

	".align 16			 \n\t"
	".L01LOOP%=:			 \n\t"	// main loop: 16 elements per iteration
	"prefetcht0	384(%2,%0,4)	 \n\t"
	"vmovups	(%2,%0,4), %%ymm12	 \n\t"	// 8 * x
	"vmovups	32(%2,%0,4), %%ymm13	 \n\t"	// 8 * x

	"prefetcht0	384(%4,%0,4)	 \n\t"
	"vmulps	(%4,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps	32(%4,%0,4), %%ymm13, %%ymm9  \n\t"
	"prefetcht0	384(%5,%0,4)	 \n\t"
	"vmulps	(%5,%0,4), %%ymm12, %%ymm10 \n\t"
	"vmulps	32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
	"vaddps	%%ymm4, %%ymm8 , %%ymm4	 \n\t"
	"vaddps	%%ymm0, %%ymm9 , %%ymm0	 \n\t"
	"vaddps	%%ymm1, %%ymm10, %%ymm1	 \n\t"
	"vaddps	%%ymm5, %%ymm11, %%ymm5	 \n\t"
	"prefetcht0	384(%6,%0,4)	 \n\t"
	"vmulps	(%6,%0,4), %%ymm12, %%ymm8  \n\t"
	"vmulps	32(%6,%0,4), %%ymm13, %%ymm9  \n\t"
	"prefetcht0	384(%7,%0,4)	 \n\t"
	"vmulps	(%7,%0,4), %%ymm12, %%ymm10 \n\t"
	"vmulps	32(%7,%0,4), %%ymm13, %%ymm11 \n\t"
	"vaddps	%%ymm6, %%ymm8 , %%ymm6	 \n\t"
	"addq	$16, %0			 \n\t"
	"vaddps	%%ymm2, %%ymm9 , %%ymm2	 \n\t"
	"vaddps	%%ymm7, %%ymm10, %%ymm7	 \n\t"
	"subq	$16, %1			 \n\t"
	"vaddps	%%ymm3, %%ymm11, %%ymm3	 \n\t"

	"jnz	.L01LOOP%=		 \n\t"

	".L16END%=:			 \n\t"

	"vaddps	%%ymm4, %%ymm0, %%ymm4	 \n\t"	// fold the eight accumulators into four
	"vaddps	%%ymm5, %%ymm1, %%ymm5	 \n\t"
	"vaddps	%%ymm6, %%ymm2, %%ymm6	 \n\t"
	"vaddps	%%ymm7, %%ymm3, %%ymm7	 \n\t"

	"vextractf128	$1 , %%ymm4, %%xmm12	 \n\t"	// horizontal reduction of each accumulator
	"vextractf128	$1 , %%ymm5, %%xmm13	 \n\t"
	"vextractf128	$1 , %%ymm6, %%xmm14	 \n\t"
	"vextractf128	$1 , %%ymm7, %%xmm15	 \n\t"

	"vaddps	%%xmm4, %%xmm12, %%xmm4	 \n\t"
	"vaddps	%%xmm5, %%xmm13, %%xmm5	 \n\t"
	"vaddps	%%xmm6, %%xmm14, %%xmm6	 \n\t"
	"vaddps	%%xmm7, %%xmm15, %%xmm7	 \n\t"

	"vhaddps	%%xmm4, %%xmm4, %%xmm4	 \n\t"
	"vhaddps	%%xmm5, %%xmm5, %%xmm5	 \n\t"
	"vhaddps	%%xmm6, %%xmm6, %%xmm6	 \n\t"
	"vhaddps	%%xmm7, %%xmm7, %%xmm7	 \n\t"

	"vhaddps	%%xmm4, %%xmm4, %%xmm4	 \n\t"
	"vhaddps	%%xmm5, %%xmm5, %%xmm5	 \n\t"
	"vhaddps	%%xmm6, %%xmm6, %%xmm6	 \n\t"
	"vhaddps	%%xmm7, %%xmm7, %%xmm7	 \n\t"

	"vmovss	%%xmm4, (%3)		 \n\t"	// store the four dot products to y
	"vmovss	%%xmm5, 4(%3)		 \n\t"
	"vmovss	%%xmm6, 8(%3)		 \n\t"
	"vmovss	%%xmm7, 12(%3)		 \n\t"

	"vzeroupper			 \n\t"

	:
	  "+r" (i),	// 0  updated by addq inside the asm, so declared read-write
	  "+r" (n)	// 1  counted down to zero inside the asm, so declared read-write
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3])	// 7
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
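
For orientation, a plain-C sketch of what the AVX kernel above computes: four simultaneous dot products of the columns ap[0]..ap[3] against x, stored to y[0]..y[3]. The _ref name and the function itself are ours for illustration only (it assumes n is a multiple of 4, which the kernel's remainder handling guarantees); the tuned asm above is the actual implementation.

// Reference-only sketch (not part of the original file): scalar
// equivalent of sgemv_kernel_4x4 above. Assumes n % 4 == 0.
static void sgemv_kernel_4x4_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT temp0 = 0.0;
	FLOAT temp1 = 0.0;
	FLOAT temp2 = 0.0;
	FLOAT temp3 = 0.0;

	for ( i = 0; i < n; i++ )
	{
		temp0 += ap[0][i] * x[i];	// column 0 dot x
		temp1 += ap[1][i] * x[i];	// column 1 dot x
		temp2 += ap[2][i] * x[i];	// column 2 dot x
		temp3 += ap[3][i] * x[i];	// column 3 dot x
	}

	y[0] = temp0;
	y[1] = temp1;
	y[2] = temp2;
	y[3] = temp3;
}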

@ -1,106 +0,0 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the OpenBLAS project nor the names of
      its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
{

	//n = n / 16;

	__asm__ __volatile__
	(
	"movq	%0, %%rax\n\t"			// n -> rax
	"vmovss	%1, %%xmm1\n\t"			// alpha -> xmm1
	"movq	%2, %%rsi\n\t"			// address of a -> rsi
	"movq	%3, %%rcx\n\t"			// value of lda -> rcx
	"movq	%4, %%rdi\n\t"			// address of x -> rdi
	"movq	%5, %%rdx\n\t"			// address of y -> rdx

	"leaq	(, %%rcx,4), %%rcx \n\t"	// scale lda by size of float
	"leaq	(%%rsi,%%rcx,1), %%r8 \n\t"	// pointer to next line

	"vxorps	%%xmm12, %%xmm12, %%xmm12\n\t"	// set accumulators to zero
	"vxorps	%%xmm13, %%xmm13, %%xmm13\n\t"
	"vxorps	%%xmm14, %%xmm14, %%xmm14\n\t"
	"vxorps	%%xmm15, %%xmm15, %%xmm15\n\t"

	"sarq	$4, %%rax \n\t"			// n = n / 16

	".align 16		 \n\t"
	".L01LOOP%=:		 \n\t"
	// "prefetcht0	512(%%rsi) \n\t"
	"prefetcht0	(%%r8)	 \n\t"		// prefetch next line of a
	"vmovups	(%%rsi), %%xmm4 \n\t"
	"vmovups	4*4(%%rsi), %%xmm5 \n\t"
	"vmovups	8*4(%%rsi), %%xmm6 \n\t"
	"vmovups	12*4(%%rsi), %%xmm7 \n\t"

	"vmulps	0*4(%%rdi), %%xmm4, %%xmm8 \n\t"	// multiply a and x
	"vmulps	4*4(%%rdi), %%xmm5, %%xmm9 \n\t"
	"vmulps	8*4(%%rdi), %%xmm6, %%xmm10\n\t"
	"vmulps	12*4(%%rdi), %%xmm7, %%xmm11\n\t"

	"vaddps	%%xmm12, %%xmm8 , %%xmm12\n\t"		// add the products to the accumulators
	"vaddps	%%xmm13, %%xmm9 , %%xmm13\n\t"
	"vaddps	%%xmm14, %%xmm10, %%xmm14\n\t"
	"vaddps	%%xmm15, %%xmm11, %%xmm15\n\t"

	"addq	$16*4 , %%r8	 \n\t"		// increment prefetch pointer
	"addq	$16*4 , %%rsi	 \n\t"		// increment pointer of a
	"addq	$16*4 , %%rdi	 \n\t"		// increment pointer of x
	"dec	%%rax		 \n\t"		// n = n - 1
	"jnz	.L01LOOP%=	 \n\t"

	"vaddps	%%xmm12, %%xmm14, %%xmm12\n\t"	// reduce the four accumulators
	"vaddps	%%xmm13, %%xmm15, %%xmm13\n\t"
	"vaddps	%%xmm12, %%xmm13, %%xmm12\n\t"
	"vhaddps	%%xmm12, %%xmm12, %%xmm12\n\t"
	"vhaddps	%%xmm12, %%xmm12, %%xmm12\n\t"

	"vmulss	%%xmm12, %%xmm1, %%xmm12 \n\t"	// scale by alpha
	"vaddss	(%%rdx), %%xmm12, %%xmm12\n\t"
	"vmovss	%%xmm12, (%%rdx)	 \n\t"	// store temp -> y

	:
	:
	  "m" (n),	// 0
	  "m" (alpha),	// 1
	  "m" (a),	// 2
	  "m" (lda),	// 3
	  "m" (x),	// 4
	  "m" (y)	// 5
	: "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
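
Since this file is deleted by the commit, for orientation only: a plain-C sketch of what the removed kernel computed, namely *y += alpha * dot(a, x) over floor(n/16)*16 elements. The _ref name is ours; lda is unused in the sketch because the asm needed it only to prefetch the next row of a.

// Reference-only sketch (not part of the original file): scalar
// equivalent of the deleted sgemv_kernel_16 above.
static void sgemv_kernel_16_ref( long n, float alpha, float *a, long lda, float *x, float *y)
{
	long i;
	float temp = 0.0f;

	(void) lda;				// only drove the prefetch in the asm version

	for ( i = 0; i < (n / 16) * 16; i++ )	// the asm runs n/16 blocks of 16 floats
		temp += a[i] * x[i];

	*y += alpha * temp;
}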

@ -1,6 +1,6 @@
 Data file for testing DSGESV/DSPOSV LAPACK routines
 12                                       Number of values of M
-0 1 2 13 17 45 78 91 101 119 120 132     values of M (row dimension)
+0 1 2 13 17 45 78 91 101 119 112 132     values of M (row dimension)
 6                                        Number of values of NRHS
 1 2 14 15 16 13                          Values of NRHS (number of right hand sides)
 30.0                                     Threshold value of test ratio