From 8109d8232c66c4119044fa3111947d88afae9eb6 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Tue, 9 Sep 2014 14:38:08 +0200
Subject: [PATCH] optimized dgemv_t kernel for haswell

---
 kernel/x86_64/dgemv_t_4.c                | 10 +---------
 kernel/x86_64/dgemv_t_microk_haswell-4.c | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c
index 0d0409bec..ebec7d2c3 100644
--- a/kernel/x86_64/dgemv_t_4.c
+++ b/kernel/x86_64/dgemv_t_4.c
@@ -28,17 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-/*
-#if defined(NEHALEM)
-#include "dgemv_t_microk_nehalem-4.c"
-#elif defined(BULLDOZER) || defined(PILEDRIVER)
-#include "dgemv_t_microk_bulldozer-4.c"
-#elif defined(SANDYBRIDGE)
-#include "dgemv_t_microk_sandy-4.c"
-#elif defined(HASWELL)
+#if defined(HASWELL)
 #include "dgemv_t_microk_haswell-4.c"
 #endif
-*/
 
 #define NBMAX 2048
 
diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c
index 410225500..33b43515d 100644
--- a/kernel/x86_64/dgemv_t_microk_haswell-4.c
+++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c
@@ -61,25 +61,25 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 	".align 16                                 \n\t"
 	".L01LOOP%=:                               \n\t"
 
-	"prefetcht0    384(%2,%0,8)                \n\t"
+	// "prefetcht0 384(%2,%0,8)                \n\t"
 	"vmovups        (%2,%0,8), %%ymm12         \n\t"  // 4 * x
 	"vmovups      32(%2,%0,8), %%ymm13         \n\t"  // 4 * x
 
-	"prefetcht0    384(%4,%0,8)                \n\t"
+	// "prefetcht0 384(%4,%0,8)                \n\t"
 	"vfmadd231pd    (%4,%0,8), %%ymm12, %%ymm4 \n\t"
 	"vfmadd231pd    (%5,%0,8), %%ymm12, %%ymm5 \n\t"
-	"prefetcht0    384(%5,%0,8)                \n\t"
-	"vfmadd231pd  32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
-	"vfmadd231pd  32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
-	"prefetcht0    384(%6,%0,8)                \n\t"
+	// "prefetcht0 384(%5,%0,8)                \n\t"
 	"vfmadd231pd    (%6,%0,8), %%ymm12, %%ymm6 \n\t"
 	"vfmadd231pd    (%7,%0,8), %%ymm12, %%ymm7 \n\t"
-	"prefetcht0    384(%7,%0,8)                \n\t"
-	"vfmadd231pd  32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
-	"vfmadd231pd  32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
+	// "prefetcht0 384(%6,%0,8)                \n\t"
+	"vfmadd231pd  32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
+	"vfmadd231pd  32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
+	"addq          $8 , %0                     \n\t"
+	// "prefetcht0 384(%7,%0,8)                \n\t"
+	"vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
+	"subq          $8 , %1                     \n\t"
+	"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
 
-	"addq          $8 , %0                     \n\t"
-	"subq          $8 , %1                     \n\t"
 	"jnz           .L01LOOP%=                  \n\t"
 
 	".L16END%=:                                \n\t"
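
Note on the changed loop: the kernel accumulates, for four matrix columns at once, the dot product of each column with x, processing eight doubles per iteration. This patch comments out the software prefetches and moves the addq/subq index/counter updates in between the second group of FMAs, compensating with -32 byte offsets for the already-advanced index register. The sketch below shows the same computation in plain C with AVX2/FMA intrinsics; it is not part of the patch, the function name dgemv_t_4x4_sketch is hypothetical, and the assumption that n is a multiple of 8 and the final reduction into y[0..3] are guesses at the surrounding kernel code that the hunk does not show.

/* Illustrative sketch only; compile with -mavx2 -mfma. */
#include <immintrin.h>

static void dgemv_t_4x4_sketch(long n, double **ap, double *x, double *y)
{
    __m256d acc0 = _mm256_setzero_pd();
    __m256d acc1 = _mm256_setzero_pd();
    __m256d acc2 = _mm256_setzero_pd();
    __m256d acc3 = _mm256_setzero_pd();

    for (long i = 0; i < n; i += 8) {           /* assumes n % 8 == 0 */
        __m256d x0 = _mm256_loadu_pd(&x[i]);     /* 4 * x */
        __m256d x1 = _mm256_loadu_pd(&x[i + 4]); /* next 4 * x */

        /* fused multiply-add of x against four matrix columns */
        acc0 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[0][i]), x0, acc0);
        acc1 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[1][i]), x0, acc1);
        acc2 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[2][i]), x0, acc2);
        acc3 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[3][i]), x0, acc3);

        acc0 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[0][i + 4]), x1, acc0);
        acc1 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[1][i + 4]), x1, acc1);
        acc2 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[2][i + 4]), x1, acc2);
        acc3 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[3][i + 4]), x1, acc3);
    }

    /* horizontal sums: each y[j] receives the dot product of column j with x
       (how the real kernel reduces and writes y is not shown in this hunk) */
    double t[4];
    _mm256_storeu_pd(t, acc0); y[0] = t[0] + t[1] + t[2] + t[3];
    _mm256_storeu_pd(t, acc1); y[1] = t[0] + t[1] + t[2] + t[3];
    _mm256_storeu_pd(t, acc2); y[2] = t[0] + t[1] + t[2] + t[3];
    _mm256_storeu_pd(t, acc3); y[3] = t[0] + t[1] + t[2] + t[3];
}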