From 3c5732615dd01e4d865d6c5d516e75889b165347 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Thu, 17 Jul 2014 23:15:07 +0200
Subject: [PATCH] added blocked sgemv_n and microkernel for bulldozer and piledriver

---
 kernel/x86_64/KERNEL.BULLDOZER           |   1 +
 kernel/x86_64/KERNEL.PILEDRIVER          |   1 +
 kernel/x86_64/sgemv_n_avx.c              | 194 +++++++++++++
 kernel/x86_64/sgemv_n_microk_bulldozer.c | 346 +++++++++++++++++++++++
 4 files changed, 542 insertions(+)
 create mode 100644 kernel/x86_64/sgemv_n_avx.c
 create mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer.c

diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index 70370a73c..55932e69f 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -1,3 +1,4 @@
+SGEMVNKERNEL = sgemv_n_avx.c
 ZGEMVNKERNEL = zgemv_n_dup.S
 ZGEMVTKERNEL = zgemv_t.S
 
diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER
index 92b5dc7c9..145d9fb2f 100644
--- a/kernel/x86_64/KERNEL.PILEDRIVER
+++ b/kernel/x86_64/KERNEL.PILEDRIVER
@@ -1,3 +1,4 @@
+SGEMVNKERNEL = sgemv_n_avx.c
 ZGEMVNKERNEL = zgemv_n_dup.S
 ZGEMVTKERNEL = zgemv_t.S
 
diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c
new file mode 100644
index 000000000..8c263543c
--- /dev/null
+++ b/kernel/x86_64/sgemv_n_avx.c
@@ -0,0 +1,194 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(BULLDOZER) || defined(PILEDRIVER)
+#include "sgemv_n_microk_bulldozer.c"
+#endif
+
+#define NBMAX 4096
+
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
+{
+	BLASLONG i;
+	for ( i=0; i<n; i++ )
+	{
+		*dest = *src;
+		dest++;
+		src += inc_src;
+	}
+}
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+	BLASLONG i;
+	for ( i=0; i<n; i++ )
+	{
+		*dest += *src;
+		src++;
+		dest += inc_dest;
+	}
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG j;
+	BLASLONG m1;			// number of full 32-row panels
+	BLASLONG m2;			// leftover rows
+	BLASLONG n2;			// columns in the current block
+	FLOAT *a_ptr;
+	FLOAT *x_ptr;
+	FLOAT *y_ptr;
+	FLOAT *xbuffer;
+	FLOAT ybuffer[32];		// result of one row panel, added into y afterwards
+
+	m1 = m >> 5;			// m / 32
+	m2 = m & 31;			// m % 32
+
+	xbuffer = buffer;
+	x_ptr = x;
+
+	for( j = 0; j < n; j += NBMAX )	// process the columns in blocks of NBMAX
+	{
+		n2 = n - j;
+		if ( n2 > NBMAX ) n2 = NBMAX;
+
+		if ( n2 > 0 )
+		{
+
+			if ( inc_x == 1 )
+				xbuffer = x_ptr;
+			else
+				copy_x(n2,x_ptr,xbuffer,inc_x);
+
+			y_ptr = y;
+
+			for(i = 0; i < m1; i++)			// full 32-row panels
+			{
+				a_ptr = a + j * lda + i * 32;
+				sgemv_kernel_32(n2, alpha, a_ptr, lda, xbuffer, ybuffer);
+				add_y(32, ybuffer, y_ptr, inc_y);
+				y_ptr += 32 * inc_y;
+			}
+
+			a_ptr = a + j * lda + m1 * 32;		// leftover rows
+			if ( m2 & 16 )
+			{
+				sgemv_kernel_16(n2, alpha, a_ptr, lda, xbuffer, ybuffer);
+				add_y(16, ybuffer, y_ptr, inc_y);
+				a_ptr += 16;
+				y_ptr += 16 * inc_y;
+			}
+			if ( m2 & 8 )
+			{
+				sgemv_kernel_8(n2, alpha, a_ptr, lda, xbuffer, ybuffer);
+				add_y(8, ybuffer, y_ptr, inc_y);
+				a_ptr += 8;
+				y_ptr += 8 * inc_y;
+			}
+			if ( m2 & 4 )
+			{
+				sgemv_kernel_4(n2, alpha, a_ptr, lda, xbuffer, ybuffer);
+				add_y(4, ybuffer, y_ptr, inc_y);
+				a_ptr += 4;
+				y_ptr += 4 * inc_y;
+			}
+			if ( m2 & 2 )
+			{
+				sgemv_kernel_2(n2, alpha, a_ptr, lda, xbuffer, ybuffer);
+				add_y(2, ybuffer, y_ptr, inc_y);
+				a_ptr += 2;
+				y_ptr += 2 * inc_y;
+			}
+			if ( m2 & 1 )
+			{
+				sgemv_kernel_1(n2, alpha, a_ptr, lda, xbuffer, ybuffer);
+				add_y(1, ybuffer, y_ptr, inc_y);
+			}
+
+		}
+
+		x_ptr += n2 * inc_x;
+	}
+
+	return(0);
+}
diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer.c b/kernel/x86_64/sgemv_n_microk_bulldozer.c
new file mode 100644
--- /dev/null
+++ b/kernel/x86_64/sgemv_n_microk_bulldozer.c
@@ -0,0 +1,346 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
+{
+
+	float *pre = a + lda*4*3;
+
+	__asm __volatile
+	(
+	"movq		%0, %%rax\n\t"			// n -> rax
+	"vbroadcastss	%1, %%ymm1\n\t"			// alpha -> ymm1
+	"movq		%2, %%rsi\n\t"			// address of a -> rsi
+	"movq		%3, %%rcx\n\t"			// value of lda -> rcx
+	"movq		%4, %%rdi\n\t"			// address of x -> rdi
+	"movq		%5, %%rdx\n\t"			// address of y -> rdx
+	"movq		%6, %%r8\n\t"			// address for prefetch
+	"prefetcht0	(%%r8)\n\t"			// prefetch
+	"prefetcht0	64(%%r8)\n\t"			// prefetch
+
+	"vxorps		%%ymm12, %%ymm12, %%ymm12\n\t"	// set to zero
+	"vxorps		%%ymm13, %%ymm13, %%ymm13\n\t"	// set to zero
+	"vxorps		%%ymm14, %%ymm14, %%ymm14\n\t"	// set to zero
+	"vxorps		%%ymm15, %%ymm15, %%ymm15\n\t"	// set to zero
+
+	".L01LOOP%=:\n\t"
+	"vbroadcastss	(%%rdi), %%ymm0\n\t"		// load next element of x
+	"addq		$4, %%rdi\n\t"			// increment pointer of x
+
+	"leaq		(%%r8, %%rcx, 4), %%r8\n\t"	// add lda to pointer for prefetch
+	"prefetcht0	(%%r8)\n\t"			// prefetch
+	"prefetcht0	64(%%r8)\n\t"			// prefetch
+
+	"vfmaddps	%%ymm12,  0*4(%%rsi), %%ymm0, %%ymm12\n\t"	// multiply a and x and add to temp
+	"vfmaddps	%%ymm13,  8*4(%%rsi), %%ymm0, %%ymm13\n\t"	// multiply a and x and add to temp
+	"vfmaddps	%%ymm14, 16*4(%%rsi), %%ymm0, %%ymm14\n\t"	// multiply a and x and add to temp
+	"vfmaddps	%%ymm15, 24*4(%%rsi), %%ymm0, %%ymm15\n\t"	// multiply a and x and add to temp
+
+	"leaq		(%%rsi, %%rcx, 4), %%rsi\n\t"	// add lda to pointer of a
+
+	"dec		%%rax\n\t"			// n = n - 1
+	"jnz		.L01LOOP%=\n\t"
+
+	"vmulps		%%ymm12, %%ymm1, %%ymm12\n\t"	// scale by alpha
+	"vmulps		%%ymm13, %%ymm1, %%ymm13\n\t"	// scale by alpha
+	"vmulps		%%ymm14, %%ymm1, %%ymm14\n\t"	// scale by alpha
+	"vmulps		%%ymm15, %%ymm1, %%ymm15\n\t"	// scale by alpha
+
+	"vmovups	%%ymm12,     (%%rdx)\n\t"	// store temp -> y
+	"vmovups	%%ymm13,  8*4(%%rdx)\n\t"	// store temp -> y
+	"vmovups	%%ymm14, 16*4(%%rdx)\n\t"	// store temp -> y
+	"vmovups	%%ymm15, 24*4(%%rdx)\n\t"	// store temp -> y
+
+	:
+	:
+	  "m" (n),	// 0
+	  "m" (alpha),	// 1
+	  "m" (a),	// 2
+	  "m" (lda),	// 3
+	  "m" (x),	// 4
+	  "m" (y),	// 5
+	  "m" (pre)	// 6
+	: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
+	  "xmm0", "xmm1",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	);
+
+}
+
+static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
+{
+
+	float *pre = a + lda*4*3;
+
+	__asm __volatile
+	(
+	"movq		%0, %%rax\n\t"			// n -> rax
+	"vbroadcastss	%1, %%ymm1\n\t"			// alpha -> ymm1
+	"movq		%2, %%rsi\n\t"			// address of a -> rsi
+	"movq		%3, %%rcx\n\t"			// value of lda -> rcx
+	"movq		%4, %%rdi\n\t"			// address of x -> rdi
+	"movq		%5, %%rdx\n\t"			// address of y -> rdx
+	"movq		%6, %%r8\n\t"			// address for prefetch
+	"prefetcht0	(%%r8)\n\t"			// prefetch
+
+	"vxorps		%%ymm12, %%ymm12, %%ymm12\n\t"	// set to zero
+	"vxorps		%%ymm13, %%ymm13, %%ymm13\n\t"	// set to zero
+
+	".L01LOOP%=:\n\t"
+	"vbroadcastss	(%%rdi), %%ymm0\n\t"		// load next element of x
+	"addq		$4, %%rdi\n\t"			// increment pointer of x
+
+	"leaq		(%%r8, %%rcx, 4), %%r8\n\t"	// add lda to pointer for prefetch
+	"prefetcht0	(%%r8)\n\t"			// prefetch
+
+	"vfmaddps	%%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t"	// multiply a and x and add to temp
+	"vfmaddps	%%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t"	// multiply a and x and add to temp
+
+	"leaq		(%%rsi, %%rcx, 4), %%rsi\n\t"	// add lda to pointer of a
+
+	"dec		%%rax\n\t"			// n = n - 1
+	"jnz		.L01LOOP%=\n\t"
+
+	"vmulps		%%ymm12, %%ymm1, %%ymm12\n\t"	// scale by alpha
+	"vmulps		%%ymm13, %%ymm1, %%ymm13\n\t"	// scale by alpha
+
+	"vmovups	%%ymm12,    (%%rdx)\n\t"	// store temp -> y
+	"vmovups	%%ymm13, 8*4(%%rdx)\n\t"	// store temp -> y
+
+	:
+	:
+	  "m" (n),	// 0
+	  "m" (alpha),	// 1
+	  "m" (a),	// 2
+	  "m" (lda),	// 3
+	  "m" (x),	// 4
+	  "m" (y),	// 5
+	  "m" (pre)	// 6
+	: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
+	  "xmm0", "xmm1",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	);
+
+}
+
+
+static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
+{
+
+	__asm __volatile
+	(
+	"movq		%0, %%rax\n\t"			// n -> rax
+	"vbroadcastss	%1, %%ymm1\n\t"			// alpha -> ymm1
+	"movq		%2, %%rsi\n\t"			// address of a -> rsi
+	"movq		%3, %%rcx\n\t"			// value of lda -> rcx
+	"movq		%4, %%rdi\n\t"			// address of x -> rdi
+	"movq		%5, %%rdx\n\t"			// address of y -> rdx
+
+	"vxorps		%%ymm12, %%ymm12, %%ymm12\n\t"	// set to zero
+
+	".L01LOOP%=:\n\t"
+	"vbroadcastss	(%%rdi), %%ymm0\n\t"		// load next element of x
+	"addq		$4, %%rdi\n\t"			// increment pointer of x
+
+	"vfmaddps	%%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t"	// multiply a and x and add to temp
+
+	"leaq		(%%rsi, %%rcx, 4), %%rsi\n\t"	// add lda to pointer of a
+
+	"dec		%%rax\n\t"			// n = n - 1
+	"jnz		.L01LOOP%=\n\t"
+
+	"vmulps		%%ymm12, %%ymm1, %%ymm12\n\t"	// scale by alpha
+
+	"vmovups	%%ymm12, (%%rdx)\n\t"		// store temp -> y
+
+	:
+	:
+	  "m" (n),	// 0
+	  "m" (alpha),	// 1
+	  "m" (a),	// 2
+	  "m" (lda),	// 3
+	  "m" (x),	// 4
+	  "m" (y)	// 5
+	: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
+	  "xmm0", "xmm1",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	);
+
+}
+
+
+static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
+{
+
+	__asm __volatile
+	(
+	"movq		%0, %%rax\n\t"			// n -> rax
+	"vbroadcastss	%1, %%xmm1\n\t"			// alpha -> xmm1
+	"movq		%2, %%rsi\n\t"			// address of a -> rsi
+	"movq		%3, %%rcx\n\t"			// value of lda -> rcx
+	"movq		%4, %%rdi\n\t"			// address of x -> rdi
+	"movq		%5, %%rdx\n\t"			// address of y -> rdx
+
+	"vxorps		%%xmm12, %%xmm12, %%xmm12\n\t"	// set to zero
+
+	".L01LOOP%=:\n\t"
+	"vbroadcastss	(%%rdi), %%xmm0\n\t"		// load next element of x
+	"addq		$4, %%rdi\n\t"			// increment pointer of x
+
+	"vfmaddps	%%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"	// multiply a and x and add to temp
+
+	"leaq		(%%rsi, %%rcx, 4), %%rsi\n\t"	// add lda to pointer of a
+
+	"dec		%%rax\n\t"			// n = n - 1
+	"jnz		.L01LOOP%=\n\t"
+
+	"vmulps		%%xmm12, %%xmm1, %%xmm12\n\t"	// scale by alpha
+
+	"vmovups	%%xmm12, (%%rdx)\n\t"		// store temp -> y
+
+	:
+	:
+	  "m" (n),	// 0
+	  "m" (alpha),	// 1
+	  "m" (a),	// 2
+	  "m" (lda),	// 3
+	  "m" (x),	// 4
+	  "m" (y)	// 5
+	: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
+	  "xmm0", "xmm1",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	);
+
+}
+
+static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
+{
+
+	__asm __volatile
+	(
+	"movq		%0, %%rax\n\t"			// n -> rax
+	"vmovss		%1, %%xmm1\n\t"			// alpha -> xmm1
+	"movq		%2, %%rsi\n\t"			// address of a -> rsi
+	"movq		%3, %%rcx\n\t"			// value of lda -> rcx
+	"movq		%4, %%rdi\n\t"			// address of x -> rdi
+	"movq		%5, %%rdx\n\t"			// address of y -> rdx
+
+	"vxorps		%%xmm12, %%xmm12, %%xmm12\n\t"	// set to zero
+	"vxorps		%%xmm13, %%xmm13, %%xmm13\n\t"	// set to zero
+
+	".L01LOOP%=:\n\t"
+	"vmovss		(%%rdi), %%xmm0\n\t"		// load next element of x
+	"addq		$4, %%rdi\n\t"			// increment pointer of x
+
+	"vfmaddss	%%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"	// multiply a and x and add to temp
+	"vfmaddss	%%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t"	// multiply a and x and add to temp
+
+	"leaq		(%%rsi, %%rcx, 4), %%rsi\n\t"	// add lda to pointer of a
+
+	"dec		%%rax\n\t"			// n = n - 1
+	"jnz		.L01LOOP%=\n\t"
+
+	"vmulss		%%xmm12, %%xmm1, %%xmm12\n\t"	// scale by alpha
+	"vmulss		%%xmm13, %%xmm1, %%xmm13\n\t"	// scale by alpha
+
+	"vmovss		%%xmm12,  (%%rdx)\n\t"		// store temp -> y
+	"vmovss		%%xmm13, 4(%%rdx)\n\t"		// store temp -> y
+
+	:
+	:
+	  "m" (n),	// 0
+	  "m" (alpha),	// 1
+	  "m" (a),	// 2
+	  "m" (lda),	// 3
+	  "m" (x),	// 4
+	  "m" (y)	// 5
+	: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
+	  "xmm0", "xmm1",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	);
+
+}
+
+
+static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
+{
+
+	__asm __volatile
+	(
+	"movq		%0, %%rax\n\t"			// n -> rax
+	"vmovss		%1, %%xmm1\n\t"			// alpha -> xmm1
+	"movq		%2, %%rsi\n\t"			// address of a -> rsi
+	"movq		%3, %%rcx\n\t"			// value of lda -> rcx
+	"movq		%4, %%rdi\n\t"			// address of x -> rdi
+	"movq		%5, %%rdx\n\t"			// address of y -> rdx
+
+	"vxorps		%%xmm12, %%xmm12, %%xmm12\n\t"	// set to zero
+
+	".L01LOOP%=:\n\t"
+	"vmovss		(%%rdi), %%xmm0\n\t"		// load next element of x
+	"addq		$4, %%rdi\n\t"			// increment pointer of x
+
+	"vfmaddss	%%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t"	// multiply a and x and add to temp
+
+	"leaq		(%%rsi, %%rcx, 4), %%rsi\n\t"	// add lda to pointer of a
+
+	"dec		%%rax\n\t"			// n = n - 1
+	"jnz		.L01LOOP%=\n\t"
+
+	"vmulss		%%xmm12, %%xmm1, %%xmm12\n\t"	// scale by alpha
+
+	"vmovss		%%xmm12, (%%rdx)\n\t"		// store temp -> y
+
+	:
+	:
+	  "m" (n),	// 0
+	  "m" (alpha),	// 1
+	  "m" (a),	// 2
+	  "m" (lda),	// 3
+	  "m" (x),	// 4
+	  "m" (y)	// 5
+	: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
+	  "xmm0", "xmm1",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	);
+
+}
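
Note on the blocking scheme (not part of the patch): the driver splits the columns of A into blocks and accumulates each block's contribution into y, which only works because sgemv computes a plain sum over columns. The small self-contained C program below is a sketch of that argument under stated assumptions: sgemv_n_ref, the test sizes, and the block width nb are illustrative names chosen here (nb plays the role of NBMAX), not code from the patch. It checks that a blocked column traversal produces the same y as a single pass.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/* Reference: y += alpha * A * x, A column-major with leading dimension lda. */
static void sgemv_n_ref(long m, long n, float alpha, const float *a, long lda,
                        const float *x, float *y)
{
	long i, j;
	for (j = 0; j < n; j++) {
		float temp = alpha * x[j];          /* scale one element of x      */
		for (i = 0; i < m; i++)
			y[i] += temp * a[j * lda + i];  /* accumulate column j into y  */
	}
}

int main(void)
{
	long m = 67, n = 129, nb = 48;          /* nb stands in for NBMAX       */
	long j, k;
	float *a  = malloc((size_t)(m * n) * sizeof(float));
	float *x  = malloc((size_t)n * sizeof(float));
	float *y1 = calloc((size_t)m, sizeof(float));
	float *y2 = calloc((size_t)m, sizeof(float));

	for (k = 0; k < m * n; k++) a[k] = (float)(k % 13) - 6.0f;
	for (k = 0; k < n; k++)     x[k] = (float)(k % 7) - 3.0f;

	/* one pass over all columns */
	sgemv_n_ref(m, n, 0.5f, a, m, x, y1);

	/* same product, columns processed in blocks of nb, accumulating into y */
	for (j = 0; j < n; j += nb) {
		long n2 = (n - j < nb) ? n - j : nb;
		sgemv_n_ref(m, n2, 0.5f, a + j * m, m, x + j, y2);
	}

	for (k = 0; k < m; k++) {
		if (fabsf(y1[k] - y2[k]) > 1e-3f) {
			printf("mismatch at row %ld\n", k);
			return 1;
		}
	}
	printf("blocked and unblocked results match\n");

	free(a); free(x); free(y1); free(y2);
	return 0;
}

Because the column blocks are visited in order and each inner loop preserves the per-column order of updates, the blocked and unblocked variants perform the identical sequence of floating-point operations, so the comparison holds exactly; the tolerance is only defensive.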