From c1a6374c6fe7df294aeca2c550bc58d61acfa654 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Wed, 13 Aug 2014 16:10:03 +0200
Subject: [PATCH] optimized zgemv_n kernel for sandybridge

---
 kernel/x86_64/KERNEL.SANDYBRIDGE       |  3 +++
 kernel/x86_64/zgemv_n.c                |  3 +++
 kernel/x86_64/zgemv_n_microk_sandy-2.c | 13 ++++++++-----
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE
index d4fbca7f2..b654d3564 100644
--- a/kernel/x86_64/KERNEL.SANDYBRIDGE
+++ b/kernel/x86_64/KERNEL.SANDYBRIDGE
@@ -1,6 +1,9 @@
 SGEMVNKERNEL = sgemv_n.c
 SGEMVTKERNEL = sgemv_t.c
 
+ZGEMVNKERNEL = zgemv_n.c
+
+
 SGEMMKERNEL  = sgemm_kernel_16x4_sandy.S
 SGEMMINCOPY  = ../generic/gemm_ncopy_16.c
 SGEMMITCOPY  = ../generic/gemm_tcopy_16.c
diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c
index 7b8907044..9098368a5 100644
--- a/kernel/x86_64/zgemv_n.c
+++ b/kernel/x86_64/zgemv_n.c
@@ -31,9 +31,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(HASWELL)
 #include "zgemv_n_microk_haswell-2.c"
+#elif defined(SANDYBRIDGE)
+#include "zgemv_n_microk_sandy-2.c"
 #endif
 
+
 #define NBMAX 1024
 
 #ifndef HAVE_KERNEL_16x4
 
diff --git a/kernel/x86_64/zgemv_n_microk_sandy-2.c b/kernel/x86_64/zgemv_n_microk_sandy-2.c
index f90e2210a..352c60f87 100644
--- a/kernel/x86_64/zgemv_n_microk_sandy-2.c
+++ b/kernel/x86_64/zgemv_n_microk_sandy-2.c
@@ -50,39 +50,42 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 
 	".align 16				 \n\t"
 	".L01LOOP%=:				 \n\t"
+	"prefetcht0	256(%4,%0,8)		 \n\t"
 	"vmovups	(%4,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
 	"vmovups      32(%4,%0,8), %%ymm9	 \n\t" // 2 complex values from a0
 
 	"vmulpd		%%ymm8 , %%ymm0 , %%ymm12	 \n\t"
 	"vmulpd		%%ymm8 , %%ymm1 , %%ymm13	 \n\t"
+	"prefetcht0	256(%5,%0,8)		 \n\t"
 	"vmulpd		%%ymm9 , %%ymm0 , %%ymm14	 \n\t"
-	"vmulpd		%%ymm9 , %%ymm1 , %%ymm15	 \n\t"
-
 	"vmovups	(%5,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
+	"vmulpd		%%ymm9 , %%ymm1 , %%ymm15	 \n\t"
 	"vmovups      32(%5,%0,8), %%ymm9	 \n\t" // 2 complex values from a0
 
 	"vmulpd		%%ymm8 , %%ymm2 , %%ymm10	 \n\t"
 	"vaddpd		%%ymm12, %%ymm10, %%ymm12	 \n\t"
 	"vmulpd		%%ymm8 , %%ymm3 , %%ymm11	 \n\t"
 	"vaddpd		%%ymm13, %%ymm11, %%ymm13	 \n\t"
+	"prefetcht0	256(%6,%0,8)		 \n\t"
 	"vmulpd		%%ymm9 , %%ymm2 , %%ymm10	 \n\t"
 	"vaddpd		%%ymm14, %%ymm10, %%ymm14	 \n\t"
+	"vmovups	(%6,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
 	"vmulpd		%%ymm9 , %%ymm3 , %%ymm11	 \n\t"
 	"vaddpd		%%ymm15, %%ymm11, %%ymm15	 \n\t"
 
-	"vmovups	(%6,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
 	"vmovups      32(%6,%0,8), %%ymm9	 \n\t" // 2 complex values from a0
 
 	"vmulpd		%%ymm8 , %%ymm4 , %%ymm10	 \n\t"
 	"vaddpd		%%ymm12, %%ymm10, %%ymm12	 \n\t"
 	"vmulpd		%%ymm8 , %%ymm5 , %%ymm11	 \n\t"
 	"vaddpd		%%ymm13, %%ymm11, %%ymm13	 \n\t"
+	"prefetcht0	256(%7,%0,8)		 \n\t"
 	"vmulpd		%%ymm9 , %%ymm4 , %%ymm10	 \n\t"
 	"vaddpd		%%ymm14, %%ymm10, %%ymm14	 \n\t"
+	"vmovups	(%7,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
 	"vmulpd		%%ymm9 , %%ymm5 , %%ymm11	 \n\t"
 	"vaddpd		%%ymm15, %%ymm11, %%ymm15	 \n\t"
 
-	"vmovups	(%7,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
 	"vmovups      32(%7,%0,8), %%ymm9	 \n\t" // 2 complex values from a0
 
 	"vmulpd		%%ymm8 , %%ymm6 , %%ymm10	 \n\t"
@@ -94,7 +97,7 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 	"vmulpd		%%ymm9 , %%ymm7 , %%ymm11	 \n\t"
 	"vaddpd		%%ymm15, %%ymm11, %%ymm15	 \n\t"
 
-	"prefetcht0	192(%3,%0,8)		 \n\t"
+	"prefetcht0	256(%3,%0,8)		 \n\t"
 	"vmovups	(%3,%0,8), %%ymm10	 \n\t"
 	"vmovups      32(%3,%0,8), %%ymm11	 \n\t"
 
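The kernel touched above processes four columns of a double-complex GEMV per pass: the patch raises the prefetcht0 distance from 192 to 256 bytes and reschedules the column loads of %%ymm8 so they overlap with independent multiplies instead of stalling behind them. Since each iteration consumes 64 bytes per stream (the paired 0/32-byte vmovups loads), a 256-byte distance prefetches roughly four iterations ahead, versus three before, assuming the induction variable steps by 8 doubles per pass. For orientation, a minimal C sketch of what the inner loop computes, assuming (per the asm operand list) that %3 is y and %4..%7 are the four column pointers in ap[]; the function name is hypothetical, and the conjugation variants and alpha handling that live elsewhere in zgemv_n.c are omitted:

#include <complex.h>

/* Hypothetical reference, not the OpenBLAS code: y accumulates four
 * columns of A, each scaled by one complex element of x. In the asm,
 * ymm0..ymm7 appear to hold broadcast real/imaginary parts of x[0..3]
 * (set up outside the hunk), the columns stream through ymm8/ymm9,
 * partial products accumulate in ymm12..ymm15, and y is loaded into
 * ymm10/ymm11 at the end of the iteration for the final combine. */
static void zgemv_kernel_4col_ref(long n, double complex **ap,
                                  const double complex *x,
                                  double complex *y)
{
        for (long i = 0; i < n; i++)
                y[i] += ap[0][i] * x[0] + ap[1][i] * x[1]
                      + ap[2][i] * x[2] + ap[3][i] * x[3];
}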