From 5c3169ecd8440788aea7e69fc70c652212d68991 Mon Sep 17 00:00:00 2001
From: Bart Oldeman <bartoldeman@users.noreply.github.com>
Date: Thu, 1 Dec 2022 07:48:05 -0500
Subject: [PATCH] dscal: use ymm registers in Haswell microkernel

Use 256-bit ymm registers instead of 128-bit xmm registers in the Haswell
dscal microkernel. This makes the microkernel consistent with cscal and
zscal, and generally doubles performance if the vector fits in L1 cache.
---
 kernel/x86_64/dscal_microk_haswell-2.c | 82 +++++++++-----------------
 1 file changed, 28 insertions(+), 54 deletions(-)

diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c
index 77ed59a4e..4551f38a2 100644
--- a/kernel/x86_64/dscal_microk_haswell-2.c
+++ b/kernel/x86_64/dscal_microk_haswell-2.c
@@ -38,22 +38,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
 
 	__asm__  __volatile__
 	(
-	"vmovddup		(%2), %%xmm0		    \n\t"  // alpha	
+	"vbroadcastsd		(%2), %%ymm0		    \n\t"  // alpha
 
 	"addq	$128, %1				    \n\t"
 
 	"cmpq 	$0, %0					    \n\t"
 	"je	4f					    \n\t" 
 
-	"vmulpd 	-128(%1), %%xmm0, %%xmm4	    \n\t"
-	"vmulpd 	-112(%1), %%xmm0, %%xmm5	    \n\t"
-	"vmulpd 	 -96(%1), %%xmm0, %%xmm6	    \n\t"
-	"vmulpd 	 -80(%1), %%xmm0, %%xmm7	    \n\t"
+	"vmulpd 	-128(%1), %%ymm0, %%ymm4	    \n\t"
+	"vmulpd 	 -96(%1), %%ymm0, %%ymm5	    \n\t"
 
-	"vmulpd 	 -64(%1), %%xmm0, %%xmm8	    \n\t"
-	"vmulpd 	 -48(%1), %%xmm0, %%xmm9	    \n\t"
-	"vmulpd 	 -32(%1), %%xmm0, %%xmm10    	    \n\t"
-	"vmulpd 	 -16(%1), %%xmm0, %%xmm11           \n\t"
+	"vmulpd 	 -64(%1), %%ymm0, %%ymm6	    \n\t"
+	"vmulpd 	 -32(%1), %%ymm0, %%ymm7    	    \n\t"
 
 	"subq	        $1 , %0			            \n\t"		
 	"jz		2f		             	    \n\t"
@@ -62,26 +58,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
 	"1:				            	    \n\t"
 	// "prefetcht0     640(%1)				    \n\t" 
 
-	"vmovups	%%xmm4  ,-128(%1)		    \n\t"
-	"vmovups	%%xmm5  ,-112(%1)		    \n\t"
-	"vmulpd 	   0(%1), %%xmm0, %%xmm4	    \n\t"
-	"vmovups	%%xmm6  , -96(%1)		    \n\t"
-	"vmulpd 	  16(%1), %%xmm0, %%xmm5	    \n\t"
-	"vmovups	%%xmm7  , -80(%1)		    \n\t"
-	"vmulpd 	  32(%1), %%xmm0, %%xmm6	    \n\t"
+	"vmovups	%%ymm4  ,-128(%1)		    \n\t"
+	"vmovups	%%ymm5  , -96(%1)		    \n\t"
+	"vmulpd 	   0(%1), %%ymm0, %%ymm4	    \n\t"
 
 	// "prefetcht0     704(%1)				    \n\t" 
 
-	"vmovups	%%xmm8  , -64(%1)		    \n\t"
-	"vmulpd 	  48(%1), %%xmm0, %%xmm7	    \n\t"
-	"vmovups	%%xmm9  , -48(%1)		    \n\t"
-	"vmulpd 	  64(%1), %%xmm0, %%xmm8	    \n\t"
-	"vmovups	%%xmm10 , -32(%1)		    \n\t"
-	"vmulpd 	  80(%1), %%xmm0, %%xmm9	    \n\t"
-	"vmovups	%%xmm11 , -16(%1)		    \n\t"
+	"vmovups	%%ymm6  , -64(%1)		    \n\t"
+	"vmulpd 	  32(%1), %%ymm0, %%ymm5	    \n\t"
+	"vmovups	%%ymm7  , -32(%1)		    \n\t"
 
-	"vmulpd 	  96(%1), %%xmm0, %%xmm10    	    \n\t"
-	"vmulpd 	 112(%1), %%xmm0, %%xmm11           \n\t"
+	"vmulpd 	  64(%1), %%ymm0, %%ymm6	    \n\t"
+	"vmulpd 	  96(%1), %%ymm0, %%ymm7    	    \n\t"
 
 
 	"addq		$128, %1	  	 	    \n\t"
@@ -90,15 +78,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
 
 	"2:				            	    \n\t"
  
-	"vmovups	%%xmm4  ,-128(%1)		    \n\t"
-	"vmovups	%%xmm5  ,-112(%1)		    \n\t"
-	"vmovups	%%xmm6  , -96(%1)		    \n\t"
-	"vmovups	%%xmm7  , -80(%1)		    \n\t"
+	"vmovups	%%ymm4  ,-128(%1)		    \n\t"
+	"vmovups	%%ymm5  , -96(%1)		    \n\t"
 
-	"vmovups	%%xmm8  , -64(%1)		    \n\t"
-	"vmovups	%%xmm9  , -48(%1)		    \n\t"
-	"vmovups	%%xmm10 , -32(%1)		    \n\t"
-	"vmovups	%%xmm11 , -16(%1)		    \n\t"
+	"vmovups	%%ymm6  , -64(%1)		    \n\t"
+	"vmovups	%%ymm7  , -32(%1)		    \n\t"
 
 	"addq		$128, %1	  	 	    \n\t"
 
@@ -107,15 +91,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
 	"cmpq	$8  ,%3					    \n\t"
 	"jne	5f					    \n\t"
 
-	"vmulpd	    -128(%1), %%xmm0, %%xmm4	    \n\t"
-	"vmulpd	    -112(%1), %%xmm0, %%xmm5	    \n\t"
-	"vmulpd	     -96(%1), %%xmm0, %%xmm6     	    \n\t"
-	"vmulpd	     -80(%1), %%xmm0, %%xmm7     	    \n\t"
+	"vmulpd	    -128(%1), %%ymm0, %%ymm4	    \n\t"
+	"vmulpd	     -96(%1), %%ymm0, %%ymm5     	    \n\t"
 
-	"vmovups	%%xmm4  ,-128(%1)		    \n\t"
-	"vmovups	%%xmm5  ,-112(%1)		    \n\t"
-	"vmovups	%%xmm6  , -96(%1)		    \n\t"
-	"vmovups	%%xmm7  , -80(%1)		    \n\t"
+	"vmovups	%%ymm4  ,-128(%1)		    \n\t"
+	"vmovups	%%ymm5  , -96(%1)		    \n\t"
 
 	"5:						    \n\t"
 
@@ -149,7 +129,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
 
 	__asm__  __volatile__
 	(
-	"vxorpd		%%xmm0, %%xmm0 , %%xmm0		    \n\t"  
+	"vxorpd	        %%ymm0, %%ymm0 , %%ymm0		    \n\t"
 
 	"addq	$128, %1				    \n\t"
 
@@ -159,15 +139,11 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
 	".p2align 4				            \n\t"
 	"1:				            	    \n\t"
 
-	"vmovups	%%xmm0  ,-128(%1)		    \n\t"
-	"vmovups	%%xmm0  ,-112(%1)		    \n\t"
-	"vmovups	%%xmm0  , -96(%1)		    \n\t"
-	"vmovups	%%xmm0  , -80(%1)		    \n\t"
+	"vmovups	%%ymm0 , -128(%1)		    \n\t"
+	"vmovups	%%ymm0 ,  -96(%1)		    \n\t"
 
-	"vmovups	%%xmm0  , -64(%1)		    \n\t"
-	"vmovups	%%xmm0  , -48(%1)		    \n\t"
-	"vmovups	%%xmm0  , -32(%1)		    \n\t"
-	"vmovups	%%xmm0  , -16(%1)		    \n\t"
+	"vmovups	%%ymm0 ,  -64(%1)		    \n\t"
+	"vmovups	%%ymm0 ,  -32(%1)		    \n\t"
 
 	"addq		$128, %1	  	 	    \n\t"
 	"subq	        $1 , %0			            \n\t"		
@@ -178,10 +154,8 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
 	"cmpq	$8  ,%3					    \n\t"
 	"jne	4f					    \n\t"
 
-	"vmovups	%%xmm0  ,-128(%1)		    \n\t"
-	"vmovups	%%xmm0  ,-112(%1)		    \n\t"
-	"vmovups	%%xmm0  , -96(%1)		    \n\t"
-	"vmovups	%%xmm0  , -80(%1)		    \n\t"
+	"vmovups	%%ymm0  ,-128(%1)		    \n\t"
+	"vmovups	%%ymm0  , -96(%1)		    \n\t"
 
 	"4:						    \n\t"