From 5392d11b045abddfe51c45e69848c807121486e8 Mon Sep 17 00:00:00 2001
From: wernsaar <wernsaar@googlemail.com>
Date: Sun, 20 Jul 2014 14:08:04 +0200
Subject: [PATCH] optimized sgemv_n_microk_sandy.c: use lda*2 for the pre offset and interleave prefetcht0 with the vmulps/vaddps groups

---
 kernel/x86_64/sgemv_n_microk_sandy.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/x86_64/sgemv_n_microk_sandy.c b/kernel/x86_64/sgemv_n_microk_sandy.c
index 7d9360f94..9bdb06600 100644
--- a/kernel/x86_64/sgemv_n_microk_sandy.c
+++ b/kernel/x86_64/sgemv_n_microk_sandy.c
@@ -29,7 +29,7 @@ static void  sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x,
 {
 
 
-	float *pre = a + lda*3;
+	float *pre = a + lda*2;
 
 	__asm__  __volatile__
 	(
@@ -58,20 +58,19 @@ static void  sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x,
 	"leaq     (%%r8 , %%rcx, 4), %%r8	 \n\t"	// add lda to pointer for prefetch
 
 	"prefetcht0	(%%r8)\n\t"			// Prefetch
-	"prefetcht0   64(%%r8)\n\t"			// Prefetch
-	"prefetcht0  128(%%r8)\n\t"			// Prefetch
-	"prefetcht0  192(%%r8)\n\t"			// Prefetch
-
 	"vmulps   0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
 	"vmulps   8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
+	"prefetcht0   64(%%r8)\n\t"			// Prefetch
 	"vmulps  16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
 	"vmulps  24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
 
 	"vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
 	"vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
+	"prefetcht0  128(%%r8)\n\t"			// Prefetch
 	"vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
 	"vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
 
+	"prefetcht0  192(%%r8)\n\t"			// Prefetch
 	"vmulps  32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
 	"vmulps  40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
 	"vmulps  48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp